diff options
| author | Michael Kruse <llvm-project@meinersbur.de> | 2025-01-03 10:22:51 +0100 |
|---|---|---|
| committer | Michael Kruse <llvm-project@meinersbur.de> | 2025-01-03 10:22:51 +0100 |
| commit | 38500d63e14ce340236840f60d356cdefb56a52c (patch) | |
| tree | 17edbec446ce9b50d2f215a483b83afb293a635d /llvm/test/CodeGen/AMDGPU | |
| parent | 1a3d5daaef7a6a63448a497da3eff7fc9e23df26 (diff) | |
| parent | 27f30029741ecf023baece7b3dde1ff9011ffefc (diff) | |
Merge branch 'main' into users/meinersbur/flang_runtime_split-headersusers/meinersbur/flang_runtime_split-headers
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
166 files changed, 16141 insertions, 6027 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index dc9e1f244383..d62da6921b34 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -4,6 +4,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s ; FIXME: Merge with other test. DS offset folding doesn't work due to ; register bank copies, and no return optimization is missing. @@ -85,6 +86,18 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_ret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 ret void @@ -162,6 +175,18 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_ret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 @@ -221,6 +246,16 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_noret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: ds_inc_u32 v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 ret void } @@ -278,6 +313,16 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_noret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: ds_inc_u32 v1, v0 offset:16 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4 ret void @@ -350,6 +395,17 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 ret void @@ -426,6 +482,17 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 @@ -503,6 +570,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i32_offset_sistem: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 @@ -567,6 +646,16 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4 ret void } @@ -633,6 +722,16 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4 ret void @@ -700,6 +799,17 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i32_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 ret void @@ -788,6 +898,19 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i32_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id @@ -867,6 +990,18 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i32_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 @@ -956,6 +1091,23 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_shl_base_lds_0_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX12-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX12-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0 @@ -1042,6 +1194,19 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_ret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 ret void @@ -1124,6 +1289,19 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_ret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 @@ -1188,6 +1366,17 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_noret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: ds_inc_u64 v2, v[0:1] +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8 ret void } @@ -1250,6 +1439,17 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_noret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: ds_inc_u64 v2, v[0:1] offset:32 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8 ret void @@ -1327,6 +1527,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 ret void @@ -1408,6 +1620,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 @@ -1490,6 +1714,19 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i64_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 @@ -1559,6 +1796,17 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8 ret void } @@ -1630,6 +1878,17 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8 ret void @@ -1702,6 +1961,18 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i64_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 ret void @@ -1795,6 +2066,20 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i64_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id @@ -1879,6 +2164,19 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i64_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:40 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5 @@ -1961,6 +2259,19 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr %out, align 4 ret void @@ -2047,6 +2358,19 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr %out, align 4 @@ -2134,6 +2458,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i32_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 store i32 %result, ptr %out, align 4 @@ -2203,6 +2541,17 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4 ret void } @@ -2276,6 +2625,17 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4 ret void @@ -2350,6 +2710,18 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 ret void @@ -2464,6 +2836,27 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_store_b32 v[0:1], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i32_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: flat_store_b32 v[0:1], v3 +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id %out.gep = getelementptr i32, ptr %out, i32 %id @@ -2560,6 +2953,23 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id %gep = getelementptr i32, ptr %gep.tid, i32 5 @@ -2655,6 +3065,25 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX11-NEXT: global_store_b32 v3, v2, s[2:3] ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_shl_base_lds_0_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, 9 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v2 +; GFX12-NEXT: v_add_nc_u32_e32 v2, 2, v2 +; GFX12-NEXT: ds_inc_rtn_u64 v[0:1], v3, v[0:1] offset:16 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b32 v3, v2, s[2:3] +; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 @@ -2754,6 +3183,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void @@ -2855,6 +3298,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 @@ -2957,6 +3414,21 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i64_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 @@ -3031,6 +3503,18 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3109,6 +3593,18 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void @@ -3188,6 +3684,19 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i64_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 ret void @@ -3313,6 +3822,28 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i64_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id @@ -3413,6 +3944,23 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i64_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 @@ -3514,6 +4062,25 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: nocse_lds_atomic_inc_ret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x10 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: ds_inc_rtn_u32 v2, v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_endpgm %result0 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 %result1 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result0, ptr addrspace(1) %out0, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll index 5dae7885f6bf..aefcad491073 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll @@ -1,25 +1,69 @@ -; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -verify-machineinstrs -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s +; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -verify-machineinstrs=0 -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s ; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_align4) ; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align4 ; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align4 void (ptr addrspace(1)): unsupported dynamic alloca -; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4) -; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4 -; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca - define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1) %ptr) { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %n = load i32, ptr addrspace(1) %gep %alloca = alloca i32, i32 %n, align 4, addrspace(5) - store volatile ptr addrspace(5) %alloca, ptr addrspace(1) undef + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} + +; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_default_align) +; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_default_align +; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_default_align void (ptr addrspace(1)): unsupported dynamic alloca + +define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_default_align(ptr addrspace(1) %ptr) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id + %n = load i32, ptr addrspace(1) %gep + %alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} +; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: kernel_dynamic_stackalloc_vgpr_align64) +; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align64 +; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align64 void (ptr addrspace(1)): unsupported dynamic alloca + +define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align64(ptr addrspace(1) %ptr) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id + %n = load i32, ptr addrspace(1) %gep + %alloca = alloca i32, i32 %n, align 64, addrspace(5) + store volatile i32 123, ptr addrspace(5) %alloca ret void } +; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4) +; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4 +; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca + define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) { %alloca = alloca i32, i32 %n, align 4, addrspace(5) - store volatile ptr addrspace(5) %alloca, ptr addrspace(1) undef + store volatile i32 456, ptr addrspace(5) %alloca + ret void +} + +; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_default_align) +; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_default_align +; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_default_align void (i32): unsupported dynamic alloca + +define void @func_dynamic_stackalloc_vgpr_default_align(i32 %n) { + %alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 456, ptr addrspace(5) %alloca + ret void +} +; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: func_dynamic_stackalloc_vgpr_align64) +; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align64 +; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align64 void (i32): unsupported dynamic alloca + +define void @func_dynamic_stackalloc_vgpr_align64(i32 %n) { + %alloca = alloca i32, i32 %n, align 64, addrspace(5) + store volatile i32 456, ptr addrspace(5) %alloca ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll index 741323a201d0..ae055ea04129 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -8,52 +8,55 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX9-NEXT: s_add_u32 s0, s0, s17 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_mov_b32 s4, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 -; GFX9-NEXT: s_and_b32 s4, s4, -16 -; GFX9-NEXT: s_lshl_b32 s4, s4, 6 -; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: s_lshl2_add_u32 s5, s5, 15 +; GFX9-NEXT: s_and_b32 s5, s5, -16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_lshl_b32 s5, s5, 6 ; GFX9-NEXT: s_mov_b32 s33, 0 +; GFX9-NEXT: s_add_u32 s32, s4, s5 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: s_add_u32 s0, s0, s17 +; GFX10-NEXT: s_mov_b32 s4, s32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mov_b32 s33, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 -; GFX10-NEXT: s_and_b32 s4, s4, -16 -; GFX10-NEXT: s_lshl_b32 s4, s4, 5 -; GFX10-NEXT: s_add_u32 s4, s32, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_mov_b32 s33, 0 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl2_add_u32 s5, s5, 15 +; GFX10-NEXT: s_and_b32 s5, s5, -16 +; GFX10-NEXT: s_lshl_b32 s5, s5, 5 +; GFX10-NEXT: s_add_u32 s32, s4, s5 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s32, 16 ; GFX11-NEXT: s_mov_b32 s33, 0 +; GFX11-NEXT: s_mov_b32 s0, s32 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX11-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, s0, -16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_and_b32 s1, s1, -16 +; GFX11-NEXT: s_lshl_b32 s1, s1, 5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s0, s32, s0 -; GFX11-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-NEXT: s_add_u32 s32, s0, s1 ; GFX11-NEXT: s_endpgm %alloca = alloca i32, i32 %n, align 4, addrspace(5) store i32 0, ptr addrspace(5) %alloca @@ -64,24 +67,25 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s7, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s6, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 -; GFX9-NEXT: s_add_u32 s4, s32, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_add_u32 s32, s6, s4 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -89,31 +93,32 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s6, s32 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 -; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: s_add_u32 s32, s6, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -121,7 +126,9 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s2, s32 +; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -129,10 +136,9 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s0, s32, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s32, s2, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv, align 4 %alloca = alloca i32, i32 %n, addrspace(5) @@ -143,52 +149,55 @@ define void @func_dynamic_stackalloc_sgpr_align4() { define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX9-NEXT: s_add_u32 s0, s0, s17 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_mov_b32 s4, s32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 -; GFX9-NEXT: s_and_b32 s4, s4, -16 -; GFX9-NEXT: s_lshl_b32 s4, s4, 6 -; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: s_lshl2_add_u32 s5, s5, 15 +; GFX9-NEXT: s_and_b32 s5, s5, -16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_lshl_b32 s5, s5, 6 ; GFX9-NEXT: s_mov_b32 s33, 0 +; GFX9-NEXT: s_add_u32 s32, s4, s5 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: s_add_u32 s0, s0, s17 +; GFX10-NEXT: s_mov_b32 s4, s32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mov_b32 s33, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 -; GFX10-NEXT: s_and_b32 s4, s4, -16 -; GFX10-NEXT: s_lshl_b32 s4, s4, 5 -; GFX10-NEXT: s_add_u32 s4, s32, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_mov_b32 s33, 0 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl2_add_u32 s5, s5, 15 +; GFX10-NEXT: s_and_b32 s5, s5, -16 +; GFX10-NEXT: s_lshl_b32 s5, s5, 5 +; GFX10-NEXT: s_add_u32 s32, s4, s5 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s32, 16 ; GFX11-NEXT: s_mov_b32 s33, 0 +; GFX11-NEXT: s_mov_b32 s0, s32 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX11-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, s0, -16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_and_b32 s1, s1, -16 +; GFX11-NEXT: s_lshl_b32 s1, s1, 5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s0, s32, s0 -; GFX11-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-NEXT: s_add_u32 s32, s0, s1 ; GFX11-NEXT: s_endpgm %alloca = alloca i32, i32 %n, align 16, addrspace(5) store i32 0, ptr addrspace(5) %alloca @@ -199,24 +208,25 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s7, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s6, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 -; GFX9-NEXT: s_add_u32 s4, s32, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_add_u32 s32, s6, s4 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -224,31 +234,32 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s6, s32 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 -; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: s_add_u32 s32, s6, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -256,7 +267,9 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s2, s32 +; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -264,10 +277,9 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s0, s32, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s32, s2, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv, align 16 %alloca = alloca i32, i32 %n, addrspace(5) @@ -279,37 +291,39 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_add_u32 s0, s0, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_add_u32 s5, s32, 0x7ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX9-NEXT: s_and_b32 s5, s5, 0xfffff800 ; GFX9-NEXT: s_and_b32 s4, s4, -16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 -; GFX9-NEXT: s_add_u32 s4, s32, s4 -; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_mov_b32 s33, 0 +; GFX9-NEXT: s_add_u32 s32, s5, s4 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-NEXT: s_movk_i32 s32, 0x400 ; GFX10-NEXT: s_add_u32 s0, s0, s17 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_movk_i32 s32, 0x400 +; GFX10-NEXT: s_add_u32 s5, s32, 0x3ff ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_and_b32 s5, s5, 0xfffffc00 ; GFX10-NEXT: s_mov_b32 s33, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 -; GFX10-NEXT: s_add_u32 s4, s32, s4 -; GFX10-NEXT: s_and_b32 s4, s4, 0xfffffc00 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_add_u32 s32, s5, s4 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align32: @@ -317,16 +331,17 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_mov_b32 s32, 32 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_add_u32 s1, s32, 0x3ff ; GFX11-NEXT: s_mov_b32 s33, 0 +; GFX11-NEXT: s_and_b32 s1, s1, 0xfffffc00 +; GFX11-NEXT: scratch_store_b32 off, v0, s1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s0, s32, s0 -; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00 -; GFX11-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s32, s1, s0 ; GFX11-NEXT: s_endpgm %alloca = alloca i32, i32 %n, align 32, addrspace(5) store i32 0, ptr addrspace(5) %alloca @@ -349,14 +364,15 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 s5, s32, 0x7ff +; GFX9-NEXT: s_and_b32 s5, s5, 0xfffff800 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 -; GFX9-NEXT: s_add_u32 s4, s32, s4 -; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_add_u32 s32, s5, s4 ; GFX9-NEXT: s_addk_i32 s32, 0xf000 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -376,15 +392,16 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_add_u32 s5, s32, 0x3ff +; GFX10-NEXT: s_and_b32 s5, s5, 0xfffffc00 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 -; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: s_add_u32 s32, s5, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xf800 -; GFX10-NEXT: s_and_b32 s4, s4, 0xfffffc00 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32: @@ -402,16 +419,18 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_add_u32 s1, s32, 0x3ff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s1, s1, 0xfffffc00 +; GFX11-NEXT: scratch_store_b32 off, v0, s1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_add_u32 s32, s1, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s0, s32, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xffc0 -; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00 -; GFX11-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv %alloca = alloca i32, i32 %n, align 32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir index fb7c2d4d705e..95d2bae98df2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir @@ -274,24 +274,18 @@ body: | ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX8-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ASHRREV_I16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX9-LABEL: name: ashr_s16_vv_zext_to_s64 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ASHRREV_I16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX10-LABEL: name: ashr_s16_vv_zext_to_s64 ; GFX10: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir index 779312596313..3a2ed71e4d22 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir @@ -79,9 +79,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], [[COPY1]], 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_CTPOP %0 @@ -104,9 +103,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY1]], [[V_BCNT_U32_B32_e64_]], 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_CTPOP %0 @@ -155,9 +153,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], [[COPY1]], 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s32) = G_CTPOP %0 @@ -181,9 +178,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY1]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], [[COPY]], 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY1]], [[COPY]], implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s32) = G_CTPOP %1 @@ -207,9 +203,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[S_BCNT1_I32_B32_:%[0-9]+]]:sreg_32 = S_BCNT1_I32_B32 [[COPY]], implicit-def dead $scc - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[S_BCNT1_I32_B32_]], [[COPY1]], 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_CTPOP %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir index e7ec5fcbba24..a96b574a6478 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir @@ -272,24 +272,18 @@ body: | ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX8-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHRREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX9-LABEL: name: lshr_s16_vv_zext_to_s64 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHRREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX10-LABEL: name: lshr_s16_vv_zext_to_s64 ; GFX10: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir index 9dc53bd1dc0b..22dd12eac092 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir @@ -24,6 +24,7 @@ body: | ; GFX8-NEXT: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec ; GFX8-NEXT: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]] + ; ; GFX9-LABEL: name: smed3_s16_vvv ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -32,14 +33,15 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]] + ; ; GFX11-LABEL: name: smed3_s16_vvv ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]] + ; GFX11-NEXT: [[V_MED3_I16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -75,6 +77,7 @@ body: | ; GFX8-NEXT: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec ; GFX8-NEXT: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MAX_I16_e64_]] + ; ; GFX9-LABEL: name: smed3_s16_vvv_multiuse0 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -84,6 +87,7 @@ body: | ; GFX9-NEXT: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec ; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_e64_]] + ; ; GFX11-LABEL: name: smed3_s16_vvv_multiuse0 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -91,8 +95,8 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[V_MAX_I16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_fake16_e64 [[COPY]], [[COPY1]], implicit $exec - ; GFX11-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_fake16_e64_]] + ; GFX11-NEXT: [[V_MED3_I16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_fake16_e64_]], implicit [[V_MAX_I16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -128,6 +132,7 @@ body: | ; GFX8-NEXT: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec ; GFX8-NEXT: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MIN_I16_e64_]] + ; ; GFX9-LABEL: name: smed3_s16_vvv_multiuse1 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -137,6 +142,7 @@ body: | ; GFX9-NEXT: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec ; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MIN_I16_e64_]] + ; ; GFX11-LABEL: name: smed3_s16_vvv_multiuse1 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -144,8 +150,8 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[V_MIN_I16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_fake16_e64 [[COPY]], [[COPY1]], implicit $exec - ; GFX11-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MIN_I16_fake16_e64_]] + ; GFX11-NEXT: [[V_MED3_I16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_fake16_e64_]], implicit [[V_MIN_I16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -181,6 +187,7 @@ body: | ; GFX8-NEXT: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec ; GFX8-NEXT: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MAX_I16_e64_1]] + ; ; GFX9-LABEL: name: smed3_s16_vvv_multiuse2 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -191,6 +198,7 @@ body: | ; GFX9-NEXT: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec ; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_e64_]] + ; ; GFX11-LABEL: name: smed3_s16_vvv_multiuse2 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -199,8 +207,8 @@ body: | ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[V_MIN_I16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_fake16_e64 [[COPY]], [[COPY1]], implicit $exec ; GFX11-NEXT: [[V_MAX_I16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_fake16_e64 [[V_MIN_I16_fake16_e64_]], [[COPY2]], implicit $exec - ; GFX11-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_fake16_e64_]] + ; GFX11-NEXT: [[V_MED3_I16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_fake16_e64_]], implicit [[V_MAX_I16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir index 6928c963a5fc..6e1489e3227d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir @@ -24,6 +24,7 @@ body: | ; GFX8-NEXT: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec ; GFX8-NEXT: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]] + ; ; GFX9-LABEL: name: umed3_s16_vvv ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -32,14 +33,15 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]] + ; ; GFX11-LABEL: name: umed3_s16_vvv ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]] + ; GFX11-NEXT: [[V_MED3_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -75,6 +77,7 @@ body: | ; GFX8-NEXT: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec ; GFX8-NEXT: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MAX_U16_e64_]] + ; ; GFX9-LABEL: name: umed3_s16_vvv_multiuse0 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -84,6 +87,7 @@ body: | ; GFX9-NEXT: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec ; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_e64_]] + ; ; GFX11-LABEL: name: umed3_s16_vvv_multiuse0 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -91,8 +95,8 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[V_MAX_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_fake16_e64 [[COPY]], [[COPY1]], implicit $exec - ; GFX11-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_fake16_e64_]] + ; GFX11-NEXT: [[V_MED3_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_fake16_e64_]], implicit [[V_MAX_U16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -128,6 +132,7 @@ body: | ; GFX8-NEXT: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec ; GFX8-NEXT: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MIN_U16_e64_]] + ; ; GFX9-LABEL: name: umed3_s16_vvv_multiuse1 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -137,6 +142,7 @@ body: | ; GFX9-NEXT: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec ; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MIN_U16_e64_]] + ; ; GFX11-LABEL: name: umed3_s16_vvv_multiuse1 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -144,8 +150,8 @@ body: | ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[V_MIN_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_fake16_e64 [[COPY]], [[COPY1]], implicit $exec - ; GFX11-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MIN_U16_fake16_e64_]] + ; GFX11-NEXT: [[V_MED3_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_fake16_e64_]], implicit [[V_MIN_U16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -181,6 +187,7 @@ body: | ; GFX8-NEXT: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec ; GFX8-NEXT: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MAX_U16_e64_1]] + ; ; GFX9-LABEL: name: umed3_s16_vvv_multiuse2 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -191,6 +198,7 @@ body: | ; GFX9-NEXT: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec ; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_e64_]] + ; ; GFX11-LABEL: name: umed3_s16_vvv_multiuse2 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -199,8 +207,8 @@ body: | ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: [[V_MIN_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_fake16_e64 [[COPY]], [[COPY1]], implicit $exec ; GFX11-NEXT: [[V_MAX_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_fake16_e64 [[V_MIN_U16_fake16_e64_]], [[COPY2]], implicit $exec - ; GFX11-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_fake16_e64_]] + ; GFX11-NEXT: [[V_MED3_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_fake16_e64_]], implicit [[V_MAX_U16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir index ae4e5feb0d74..130f87e44eac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s --- name: sitofp_i32_to_f32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir index dac85561208d..d80a13c4d7c7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s --- name: fadd_f32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir index e065e09766dd..c75a2926e7cf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s --- name: f32_olt diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir index bcb6d75c1830..b0703a642e03 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir @@ -272,24 +272,18 @@ body: | ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHLREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX9-LABEL: name: shl_s16_vv_zext_to_s64 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHLREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX10-LABEL: name: shl_s16_vv_zext_to_s64 ; GFX10: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir index 66e95921679e..c73471139e87 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir @@ -535,19 +535,16 @@ body: | ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]] ; CHECK-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32) - ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[SHL1]] + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] ; CHECK-NEXT: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; CHECK-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>) ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C1]] ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[SHL2]] ; CHECK-NEXT: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C2]](s32) ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) - ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL3]] + ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL3]] ; CHECK-NEXT: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) ; CHECK-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST8]](<2 x s16>), [[BITCAST9]](<2 x s16>) ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<4 x s16>) = G_AND [[CONCAT_VECTORS2]], [[CONCAT_VECTORS3]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir index 70f7e7ae623c..f47c9b89a81d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir @@ -763,89 +763,61 @@ body: | ; CHECK-NEXT: [[OR63:%[0-9]+]]:_(s32) = G_OR [[OR62]], [[SHL63]] ; CHECK-NEXT: [[SHL64:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C3]](s32) ; CHECK-NEXT: [[OR64:%[0-9]+]]:_(s32) = G_OR [[OR63]], [[SHL64]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL65:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C4]](s32) + ; CHECK-NEXT: [[SHL65:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C4]](s32) ; CHECK-NEXT: [[OR65:%[0-9]+]]:_(s32) = G_OR [[OR64]], [[SHL65]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL66:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C5]](s32) + ; CHECK-NEXT: [[SHL66:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C5]](s32) ; CHECK-NEXT: [[OR66:%[0-9]+]]:_(s32) = G_OR [[OR65]], [[SHL66]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL67:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C6]](s32) + ; CHECK-NEXT: [[SHL67:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C6]](s32) ; CHECK-NEXT: [[OR67:%[0-9]+]]:_(s32) = G_OR [[OR66]], [[SHL67]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL68:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C7]](s32) + ; CHECK-NEXT: [[SHL68:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C7]](s32) ; CHECK-NEXT: [[OR68:%[0-9]+]]:_(s32) = G_OR [[OR67]], [[SHL68]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL69:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[C8]](s32) + ; CHECK-NEXT: [[SHL69:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C8]](s32) ; CHECK-NEXT: [[OR69:%[0-9]+]]:_(s32) = G_OR [[OR68]], [[SHL69]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL70:%[0-9]+]]:_(s32) = G_SHL [[COPY7]], [[C9]](s32) + ; CHECK-NEXT: [[SHL70:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C9]](s32) ; CHECK-NEXT: [[OR70:%[0-9]+]]:_(s32) = G_OR [[OR69]], [[SHL70]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL71:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[C10]](s32) + ; CHECK-NEXT: [[SHL71:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C10]](s32) ; CHECK-NEXT: [[OR71:%[0-9]+]]:_(s32) = G_OR [[OR70]], [[SHL71]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL72:%[0-9]+]]:_(s32) = G_SHL [[COPY9]], [[C11]](s32) + ; CHECK-NEXT: [[SHL72:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C11]](s32) ; CHECK-NEXT: [[OR72:%[0-9]+]]:_(s32) = G_OR [[OR71]], [[SHL72]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL73:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[C12]](s32) + ; CHECK-NEXT: [[SHL73:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C12]](s32) ; CHECK-NEXT: [[OR73:%[0-9]+]]:_(s32) = G_OR [[OR72]], [[SHL73]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL74:%[0-9]+]]:_(s32) = G_SHL [[COPY11]], [[C13]](s32) + ; CHECK-NEXT: [[SHL74:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C13]](s32) ; CHECK-NEXT: [[OR74:%[0-9]+]]:_(s32) = G_OR [[OR73]], [[SHL74]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL75:%[0-9]+]]:_(s32) = G_SHL [[COPY12]], [[C14]](s32) + ; CHECK-NEXT: [[SHL75:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C14]](s32) ; CHECK-NEXT: [[OR75:%[0-9]+]]:_(s32) = G_OR [[OR74]], [[SHL75]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL76:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C15]](s32) + ; CHECK-NEXT: [[SHL76:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C15]](s32) ; CHECK-NEXT: [[OR76:%[0-9]+]]:_(s32) = G_OR [[OR75]], [[SHL76]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL77:%[0-9]+]]:_(s32) = G_SHL [[COPY14]], [[C16]](s32) + ; CHECK-NEXT: [[SHL77:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C16]](s32) ; CHECK-NEXT: [[OR77:%[0-9]+]]:_(s32) = G_OR [[OR76]], [[SHL77]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL78:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C17]](s32) + ; CHECK-NEXT: [[SHL78:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C17]](s32) ; CHECK-NEXT: [[OR78:%[0-9]+]]:_(s32) = G_OR [[OR77]], [[SHL78]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL79:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C18]](s32) + ; CHECK-NEXT: [[SHL79:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C18]](s32) ; CHECK-NEXT: [[OR79:%[0-9]+]]:_(s32) = G_OR [[OR78]], [[SHL79]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL80:%[0-9]+]]:_(s32) = G_SHL [[COPY17]], [[C19]](s32) + ; CHECK-NEXT: [[SHL80:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C19]](s32) ; CHECK-NEXT: [[OR80:%[0-9]+]]:_(s32) = G_OR [[OR79]], [[SHL80]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL81:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C20]](s32) + ; CHECK-NEXT: [[SHL81:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C20]](s32) ; CHECK-NEXT: [[OR81:%[0-9]+]]:_(s32) = G_OR [[OR80]], [[SHL81]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL82:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C21]](s32) + ; CHECK-NEXT: [[SHL82:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C21]](s32) ; CHECK-NEXT: [[OR82:%[0-9]+]]:_(s32) = G_OR [[OR81]], [[SHL82]] - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL83:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[C22]](s32) + ; CHECK-NEXT: [[SHL83:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C22]](s32) ; CHECK-NEXT: [[OR83:%[0-9]+]]:_(s32) = G_OR [[OR82]], [[SHL83]] - ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL84:%[0-9]+]]:_(s32) = G_SHL [[COPY21]], [[C23]](s32) + ; CHECK-NEXT: [[SHL84:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C23]](s32) ; CHECK-NEXT: [[OR84:%[0-9]+]]:_(s32) = G_OR [[OR83]], [[SHL84]] - ; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL85:%[0-9]+]]:_(s32) = G_SHL [[COPY22]], [[C24]](s32) + ; CHECK-NEXT: [[SHL85:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C24]](s32) ; CHECK-NEXT: [[OR85:%[0-9]+]]:_(s32) = G_OR [[OR84]], [[SHL85]] - ; CHECK-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL86:%[0-9]+]]:_(s32) = G_SHL [[COPY23]], [[C25]](s32) + ; CHECK-NEXT: [[SHL86:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C25]](s32) ; CHECK-NEXT: [[OR86:%[0-9]+]]:_(s32) = G_OR [[OR85]], [[SHL86]] - ; CHECK-NEXT: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL87:%[0-9]+]]:_(s32) = G_SHL [[COPY24]], [[C26]](s32) + ; CHECK-NEXT: [[SHL87:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C26]](s32) ; CHECK-NEXT: [[OR87:%[0-9]+]]:_(s32) = G_OR [[OR86]], [[SHL87]] - ; CHECK-NEXT: [[COPY25:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL88:%[0-9]+]]:_(s32) = G_SHL [[COPY25]], [[C27]](s32) + ; CHECK-NEXT: [[SHL88:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C27]](s32) ; CHECK-NEXT: [[OR88:%[0-9]+]]:_(s32) = G_OR [[OR87]], [[SHL88]] - ; CHECK-NEXT: [[COPY26:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL89:%[0-9]+]]:_(s32) = G_SHL [[COPY26]], [[C28]](s32) + ; CHECK-NEXT: [[SHL89:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C28]](s32) ; CHECK-NEXT: [[OR89:%[0-9]+]]:_(s32) = G_OR [[OR88]], [[SHL89]] - ; CHECK-NEXT: [[COPY27:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL90:%[0-9]+]]:_(s32) = G_SHL [[COPY27]], [[C29]](s32) + ; CHECK-NEXT: [[SHL90:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C29]](s32) ; CHECK-NEXT: [[OR90:%[0-9]+]]:_(s32) = G_OR [[OR89]], [[SHL90]] - ; CHECK-NEXT: [[COPY28:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL91:%[0-9]+]]:_(s32) = G_SHL [[COPY28]], [[C30]](s32) + ; CHECK-NEXT: [[SHL91:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C30]](s32) ; CHECK-NEXT: [[OR91:%[0-9]+]]:_(s32) = G_OR [[OR90]], [[SHL91]] - ; CHECK-NEXT: [[COPY29:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL92:%[0-9]+]]:_(s32) = G_SHL [[COPY29]], [[C31]](s32) + ; CHECK-NEXT: [[SHL92:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C31]](s32) ; CHECK-NEXT: [[OR92:%[0-9]+]]:_(s32) = G_OR [[OR91]], [[SHL92]] ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR30]](s32), [[OR61]](s32), [[OR92]](s32) ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s68) = G_TRUNC [[MV]](s96) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir index 483698e01417..ef7759f5120f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir @@ -538,19 +538,16 @@ body: | ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]] ; CHECK-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32) - ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[SHL1]] + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] ; CHECK-NEXT: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) ; CHECK-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>) ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C1]] ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[SHL2]] ; CHECK-NEXT: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C2]](s32) ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) - ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL3]] + ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL3]] ; CHECK-NEXT: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) ; CHECK-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST8]](<2 x s16>), [[BITCAST9]](<2 x s16>) ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(<4 x s16>) = G_OR [[CONCAT_VECTORS2]], [[CONCAT_VECTORS3]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir index d82e8328f26e..f1c4994c6f76 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir @@ -134,8 +134,7 @@ body: | ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C2]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C1]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32) ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[BITCAST2]](<2 x s16>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir index a99a34f09046..62d3fb8b3db7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir @@ -57,9 +57,8 @@ body: | ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C2]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C2]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) @@ -99,12 +98,11 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]] + ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; @@ -381,9 +379,8 @@ body: | ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C2]](s32) + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C2]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) @@ -423,12 +420,11 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]] + ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir index 9353813a8dbf..a993afc22b0c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir @@ -537,19 +537,16 @@ body: | ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]] ; CHECK-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32) - ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[COPY]], [[SHL1]] + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] ; CHECK-NEXT: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; CHECK-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>) ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C1]] ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[SHL2]] ; CHECK-NEXT: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C2]](s32) ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) - ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL3]] + ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL3]] ; CHECK-NEXT: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) ; CHECK-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST8]](<2 x s16>), [[BITCAST9]](<2 x s16>) ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(<4 x s16>) = G_XOR [[CONCAT_VECTORS2]], [[CONCAT_VECTORS3]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zext.mir index 8db6a7c78aaa..0b34dffc5004 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zext.mir @@ -726,15 +726,14 @@ body: | ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[C]](s32) ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[UV]], [[SHL2]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32) + ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[UV1]], [[SHL3]] ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; CHECK-NEXT: [[EXTRACT:%[0-9]+]]:_(s48) = G_EXTRACT [[DEF]](s64), 0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[C3]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[C3]](s64) ; CHECK-NEXT: [[EXTRACT1:%[0-9]+]]:_(s48) = G_EXTRACT [[MV1]](s64), 0 - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[COPY3]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[COPY2]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[EXTRACT]](s48) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[EXTRACT1]](s48) ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[ANYEXT1]] @@ -751,17 +750,17 @@ body: | ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C1]] ; CHECK-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) - ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[COPY1]], [[SHL5]] + ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL5]] ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C1]] ; CHECK-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[LSHR3]], [[SHL6]] ; CHECK-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32) ; CHECK-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL5]] ; CHECK-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV4]](s32), [[OR8]](s32) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[OR7]](s32) - ; CHECK-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[UV4]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR7]](s32) + ; CHECK-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY3]](s32), [[UV4]](s32) ; CHECK-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] - ; CHECK-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[COPY1]], [[SHL3]] + ; CHECK-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL3]] ; CHECK-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR9]](s32), [[OR10]](s32) ; CHECK-NEXT: [[MV7:%[0-9]+]]:_(s384) = G_MERGE_VALUES [[AND1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64) ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s112) = G_TRUNC [[MV7]](s384) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll index de46037e96e8..1813003181d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -1,25 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 { -; GCN-LABEL: v_interp_f32: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_mov_b32 s3, exec_lo -; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: lds_param_load v0, attr0.y wait_vdst:15 -; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15 -; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 -; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0 -; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 -; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done -; GCN-NEXT: s_endpgm +; GFX11-LABEL: v_interp_f32: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: s_mov_b32 m0, s2 +; GFX11-NEXT: lds_param_load v0, attr0.y wait_vdst:15 +; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15 +; GFX11-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v4, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 +; GFX11-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0 +; GFX11-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 +; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v0, attr0.y wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v4, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0 +; GFX12-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 +; GFX12-NEXT: export mrt0 v3, v2, v5, v4 done +; GFX12-NEXT: s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0) %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) @@ -32,30 +54,55 @@ main_body: } define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 { -; GCN-LABEL: v_interp_f32_many: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_mov_b32 s3, exec_lo -; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: lds_param_load v0, attr0.x wait_vdst:15 -; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15 -; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15 -; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15 -; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 -; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 -; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 -; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0 -; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 -; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done -; GCN-NEXT: s_endpgm +; GFX11-LABEL: v_interp_f32_many: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: s_mov_b32 m0, s2 +; GFX11-NEXT: lds_param_load v0, attr0.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v2, attr2.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v3, attr3.x wait_vdst:15 +; GFX11-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 +; GFX11-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 +; GFX11-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 +; GFX11-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0 +; GFX11-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 +; GFX11-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 +; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32_many: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v0, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v2, attr2.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v3, attr3.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 +; GFX12-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 +; GFX12-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0 +; GFX12-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 +; GFX12-NEXT: export mrt0 v6, v7, v8, v4 done +; GFX12-NEXT: s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) @@ -74,30 +121,55 @@ main_body: } define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0) #0 { -; GCN-LABEL: v_interp_f32_many_vm: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_mov_b32 s0, exec_lo -; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo -; GCN-NEXT: lds_param_load v2, attr0.x wait_vdst:15 -; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15 -; GCN-NEXT: lds_param_load v4, attr2.x wait_vdst:15 -; GCN-NEXT: lds_param_load v5, attr3.x wait_vdst:15 -; GCN-NEXT: s_mov_b32 exec_lo, s0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3 -; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 -; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 -; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 -; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done -; GCN-NEXT: s_endpgm +; GFX11-LABEL: v_interp_f32_many_vm: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 +; GFX11-NEXT: s_mov_b32 m0, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: lds_param_load v2, attr0.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v3, attr1.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v4, attr2.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v5, attr3.x wait_vdst:15 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3 +; GFX11-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 +; GFX11-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 +; GFX11-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 +; GFX11-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 +; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 +; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32_many_vm: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 +; GFX12-NEXT: s_mov_b32 m0, s0 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: ds_param_load v2, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v3, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v4, attr2.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v5, attr3.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3 +; GFX12-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 +; GFX12-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 +; GFX12-NEXT: export mrt0 v6, v7, v8, v0 done +; GFX12-NEXT: s_endpgm main_body: %i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1 %i = load float, ptr addrspace(1) %i.ptr, align 4 @@ -120,23 +192,77 @@ main_body: } define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { -; GCN-LABEL: v_interp_f16: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_mov_b32 s3, exec_lo -; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15 -; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0 -; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 -; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 -; GCN-NEXT: v_add_f16_e32 v0, v3, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_interp_f16: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo +; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2 +; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0 +; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7 +; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_interp_f16: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo +; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2 +; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: v_interp_f16: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: s_mov_b32 s3, exec_lo +; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: s_mov_b32 m0, s2 +; GFX12-TRUE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0 +; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7 +; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: v_interp_f16: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: s_mov_b32 s3, exec_lo +; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-FAKE16-NEXT: s_mov_b32 m0, s2 +; GFX12-FAKE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0) @@ -148,23 +274,77 @@ main_body: } define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { -; GCN-LABEL: v_interp_rtz_f16: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_mov_b32 s3, exec_lo -; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15 -; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 -; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 -; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 -; GCN-NEXT: v_add_f16_e32 v0, v3, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_interp_rtz_f16: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo +; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2 +; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0 +; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7 +; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_interp_rtz_f16: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo +; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2 +; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: v_interp_rtz_f16: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: s_mov_b32 s3, exec_lo +; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: s_mov_b32 m0, s2 +; GFX12-TRUE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0 +; GFX12-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7 +; GFX12-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: v_interp_rtz_f16: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: s_mov_b32 s3, exec_lo +; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-FAKE16-NEXT: s_mov_b32 m0, s2 +; GFX12-FAKE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX12-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX12-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0) @@ -176,17 +356,55 @@ main_body: } define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 { -; GCN-LABEL: v_interp_f16_imm_params: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7 -; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_add_f16_e32 v0, v1, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_interp_f16_imm_params: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7 +; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_interp_f16_imm_params: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7 +; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: v_interp_f16_imm_params: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7 +; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: v_interp_f16_imm_params: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7 +; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0) %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll index 71a2d3e8a530..44b12a9f6fe8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s declare void @readsMem(ptr) #0 declare void @writesMem(ptr) #1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index 01287d5b7cf2..69abef02d3d9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -27,19 +27,20 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_load_dword s5, s[8:9], 0x10 -; GCN-NEXT: s_add_u32 s4, s32, 0x1000 +; GCN-NEXT: s_load_dword s4, s[8:9], 0x10 +; GCN-NEXT: s_mov_b32 s6, s32 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s5, s5, 2 -; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: s_add_u32 s4, s6, s4 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_u32 s32, s6, 0x1000 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -94,19 +95,20 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_load_dword s4, s[8:9], 0xc -; GCN-NEXT: s_add_u32 s5, s32, 0x1000 -; GCN-NEXT: s_and_b32 s5, s5, 0xfffff000 +; GCN-NEXT: s_add_u32 s5, s32, 0xfff +; GCN-NEXT: s_and_b32 s6, s5, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s4, s4, 2 ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_add_u32 s4, s6, s4 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_u32 s32, s6, 0x1000 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -159,7 +161,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; GCN-NEXT: s_and_b64 exec, exec, vcc ; GCN-NEXT: s_cbranch_execz .LBB2_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_add_u32 s6, s32, 0x1000 +; GCN-NEXT: s_mov_b32 s6, s32 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s6 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen @@ -169,6 +171,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; GCN-NEXT: v_add_u32_e32 v2, s6, v2 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31 +; GCN-NEXT: s_add_u32 s32, s6, 0x1000 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off @@ -219,7 +222,7 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_add_u32 s6, s32, 0x1000 +; GCN-NEXT: s_add_u32 s6, s32, 0xfff ; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v4, s6 @@ -230,6 +233,7 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: v_add_u32_e32 v2, s6, v2 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31 +; GCN-NEXT: s_add_u32 s32, s6, 0x1000 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir index ed1ca320943d..5378ce2d1efa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir @@ -23,8 +23,11 @@ body: | ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align1 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} @@ -32,8 +35,10 @@ body: | ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) %0:_(s32) = COPY $sgpr0 %1:_(p5) = G_DYN_STACKALLOC %0, 1 S_ENDPGM 0, implicit %1 @@ -57,8 +62,11 @@ body: | ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align2 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} @@ -66,8 +74,10 @@ body: | ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) %0:_(s32) = COPY $sgpr0 %1:_(p5) = G_DYN_STACKALLOC %0, 2 S_ENDPGM 0, implicit %1 @@ -91,8 +101,11 @@ body: | ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align4 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} @@ -100,8 +113,10 @@ body: | ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) %0:_(s32) = COPY $sgpr0 %1:_(p5) = G_DYN_STACKALLOC %0, 4 S_ENDPGM 0, implicit %1 @@ -125,8 +140,11 @@ body: | ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align8 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} @@ -134,8 +152,10 @@ body: | ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) %0:_(s32) = COPY $sgpr0 %1:_(p5) = G_DYN_STACKALLOC %0, 8 S_ENDPGM 0, implicit %1 @@ -159,8 +179,11 @@ body: | ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align16 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} @@ -168,8 +191,10 @@ body: | ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) %0:_(s32) = COPY $sgpr0 %1:_(p5) = G_DYN_STACKALLOC %0, 16 S_ENDPGM 0, implicit %1 @@ -193,10 +218,14 @@ body: | ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048 - ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047 + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32) + ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048 + ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5) ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align32 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} @@ -204,9 +233,12 @@ body: | ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1024 - ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1023 + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32) + ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1024 + ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5) ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5) %0:_(s32) = COPY $sgpr0 %1:_(p5) = G_DYN_STACKALLOC %0, 32 @@ -231,10 +263,14 @@ body: | ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096 - ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32) + ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096 + ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5) ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align64 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} @@ -242,9 +278,12 @@ body: | ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048 - ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047 + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32) + ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048 + ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5) ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5) %0:_(s32) = COPY $sgpr0 %1:_(p5) = G_DYN_STACKALLOC %0, 64 @@ -269,10 +308,14 @@ body: | ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -8192 - ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8191 + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32) + ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -8192 + ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5) ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align128 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} @@ -280,9 +323,12 @@ body: | ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) - ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096 - ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32) + ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096 + ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5) ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5) %0:_(s32) = COPY $sgpr0 %1:_(p5) = G_DYN_STACKALLOC %0, 128 @@ -304,15 +350,20 @@ body: | ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) ; WAVE64-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5) + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align4 ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 32 ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) ; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5) + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5) %0:_(s32) = G_CONSTANT i32 32 %1:_(p5) = G_DYN_STACKALLOC %0, 4 S_ENDPGM 0, implicit %1 @@ -336,8 +387,11 @@ body: | ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) ; WAVE64-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5) + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align8 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} @@ -345,8 +399,10 @@ body: | ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) ; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5) + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5) %0:_(s32) = G_CONSTANT i32 32 %1:_(p5) = G_DYN_STACKALLOC %0, 8 S_ENDPGM 0, implicit %1 @@ -370,8 +426,11 @@ body: | ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) ; WAVE64-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) - ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5) + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align16 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} @@ -379,8 +438,10 @@ body: | ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) ; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) - ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY [[COPY]](p5) + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]](p5) %0:_(s32) = G_CONSTANT i32 32 %1:_(p5) = G_DYN_STACKALLOC %0, 16 S_ENDPGM 0, implicit %1 @@ -404,10 +465,14 @@ body: | ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) ; WAVE64-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) - ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048 - ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047 + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; WAVE64-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048 + ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C3]](s32) + ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5) ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5) + ; ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align32 ; WAVE32: liveins: $sgpr0 ; WAVE32-NEXT: {{ $}} @@ -415,9 +480,12 @@ body: | ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) ; WAVE32-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg - ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) - ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1024 - ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1023 + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; WAVE32-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1024 + ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C3]](s32) + ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5) ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5) %0:_(s32) = G_CONSTANT i32 32 %1:_(p5) = G_DYN_STACKALLOC %0, 32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 168e6dfa5f14..e289ee759da1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -5105,30 +5105,30 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_saddsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s4, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s5, s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_addc_u32 s8, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: s_addc_u32 s9, s3, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc @@ -5147,27 +5147,27 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s4, s0, s4 ; GFX8-NEXT: s_addc_u32 s5, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s8, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_addc_u32 s9, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] -; GFX8-NEXT: s_and_b32 s0, 1, s2 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s9, 31 @@ -5194,27 +5194,27 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s4, s0, s4 ; GFX9-NEXT: s_addc_u32 s5, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s8, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_addc_u32 s9, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] -; GFX9-NEXT: s_and_b32 s0, 1, s2 +; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s9, 31 @@ -5895,30 +5895,30 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-LABEL: s_saddsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s8, s0, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s9, s1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_addc_u32 s16, s2, s10 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: s_addc_u32 s17, s3, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s17, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc @@ -5928,30 +5928,30 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX6-NEXT: s_add_u32 s0, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_addc_u32 s1, s5, s13 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_addc_u32 s2, s6, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: s_addc_u32 s3, s7, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc @@ -5974,27 +5974,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s8, s0, s8 ; GFX8-NEXT: s_addc_u32 s9, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s16, s2, s10 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_addc_u32 s17, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1] -; GFX8-NEXT: s_and_b32 s0, 1, s2 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s17, 31 @@ -6013,27 +6013,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX8-NEXT: s_addc_u32 s1, s5, s13 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_addc_u32 s2, s6, s14 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: s_and_b32 s4, 1, s6 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: s_and_b32 s4, 1, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX8-NEXT: s_and_b32 s4, 1, s6 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31 @@ -6064,27 +6064,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s8, s0, s8 ; GFX9-NEXT: s_addc_u32 s9, s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s16, s2, s10 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_addc_u32 s17, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: s_and_b32 s0, 1, s2 +; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s17, 31 @@ -6103,27 +6103,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX9-NEXT: s_addc_u32 s1, s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_addc_u32 s2, s6, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b32 s4, 1, s6 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX9-NEXT: s_and_b32 s4, 1, s4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX9-NEXT: s_and_b32 s4, 1, s6 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 7214f4ab581d..43ebe156eb2a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -5111,22 +5111,23 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_subb_u32 s10, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: s_subb_u32 s11, s3, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -5136,7 +5137,6 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc @@ -5155,26 +5155,26 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s8, s0, s4 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s10, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_subb_u32 s11, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] -; GFX8-NEXT: s_and_b32 s0, 1, s2 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -5204,26 +5204,26 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s8, s0, s4 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s10, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_subb_u32 s11, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] -; GFX9-NEXT: s_and_b32 s0, 1, s2 +; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -5949,22 +5949,23 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-LABEL: s_ssubsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s16, s0, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s17, s1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_subb_u32 s18, s2, s10 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] ; GFX6-NEXT: s_subb_u32 s19, s3, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -5974,7 +5975,6 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc @@ -5984,22 +5984,23 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX6-NEXT: s_sub_u32 s0, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_subb_u32 s1, s5, s13 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_subb_u32 s2, s6, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: s_subb_u32 s3, s7, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; GFX6-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -6009,7 +6010,6 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc @@ -6032,26 +6032,26 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s16, s0, s8 ; GFX8-NEXT: s_subb_u32 s17, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s18, s2, s10 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_subb_u32 s19, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] -; GFX8-NEXT: s_and_b32 s0, 1, s2 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6073,26 +6073,26 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX8-NEXT: s_subb_u32 s1, s5, s13 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_subb_u32 s2, s6, s14 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_subb_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: s_and_b32 s4, 1, s6 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: s_and_b32 s4, 1, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX8-NEXT: s_and_b32 s4, 1, s6 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -6126,26 +6126,26 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s16, s0, s8 ; GFX9-NEXT: s_subb_u32 s17, s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s18, s2, s10 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_subb_u32 s19, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] -; GFX9-NEXT: s_and_b32 s0, 1, s2 +; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6167,26 +6167,26 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc ; GFX9-NEXT: s_subb_u32 s1, s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_subb_u32 s2, s6, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_subb_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b32 s4, 1, s6 +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX9-NEXT: s_and_b32 s4, 1, s4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX9-NEXT: s_and_b32 s4, 1, s6 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 @@ -6300,15 +6300,15 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; ; GFX11-LABEL: s_ssubsat_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_sub_u32 s16, s0, s8 -; GFX11-NEXT: s_subb_u32 s17, s1, s9 -; GFX11-NEXT: s_subb_u32 s18, s2, s10 -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] -; GFX11-NEXT: s_subb_u32 s19, s3, s11 -; GFX11-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] +; GFX11-NEXT: s_sub_u32 s18, s0, s8 +; GFX11-NEXT: s_subb_u32 s19, s1, s9 +; GFX11-NEXT: s_subb_u32 s16, s2, s10 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] +; GFX11-NEXT: s_subb_u32 s17, s3, s11 +; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX11-NEXT: s_cselect_b32 s20, 1, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] ; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s20 @@ -6317,7 +6317,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_ashr_i32 s8, s19, 31 +; GFX11-NEXT: s_ashr_i32 s8, s17, 31 ; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 @@ -6351,12 +6351,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s16 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s18 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s18 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v4, s17 -; GFX11-NEXT: v_mov_b32_e32 v2, s19 +; GFX11-NEXT: v_mov_b32_e32 v4, s19 +; GFX11-NEXT: v_mov_b32_e32 v2, s17 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 6bb4e2d3dbe2..ed85fb19d905 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -204,18 +204,37 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b) } define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) { -; GCN-LABEL: vector_xnor_i32_one_use: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: vector_xnor_i32_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_not_b32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: vector_xnor_i32_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: vector_xnor_i32_one_use: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX900-NEXT: v_not_b32_e32 v0, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: vector_xnor_i32_one_use: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_xnor_b32_e32 v0, v0, v1 +; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: vector_xnor_i32_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_xnor_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %xor = xor i32 %a, %b @@ -224,22 +243,45 @@ entry: } define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) { -; GCN-LABEL: vector_xnor_i64_one_use: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v3 -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: v_not_b32_e32 v1, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: vector_xnor_i64_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_not_b32_e32 v0, v0 +; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: vector_xnor_i64_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: v_not_b32_e32 v1, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: vector_xnor_i64_one_use: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX900-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX900-NEXT: v_not_b32_e32 v0, v0 +; GFX900-NEXT: v_not_b32_e32 v1, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: vector_xnor_i64_one_use: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_xnor_b32_e32 v0, v0, v2 +; GFX906-NEXT: v_xnor_b32_e32 v1, v1, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: vector_xnor_i64_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_not_b32_e32 v0, v0 -; GFX10-NEXT: v_not_b32_e32 v1, v1 +; GFX10-NEXT: v_xnor_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_xnor_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %xor = xor i64 %a, %b @@ -248,16 +290,32 @@ entry: } define amdgpu_ps float @xnor_s_v_i32_one_use(i32 inreg %s, i32 %v) { -; GCN-LABEL: xnor_s_v_i32_one_use: -; GCN: ; %bb.0: -; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX7-LABEL: xnor_s_v_i32_one_use: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: xnor_s_v_i32_one_use: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX900-LABEL: xnor_s_v_i32_one_use: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX900-NEXT: v_not_b32_e32 v0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX906-LABEL: xnor_s_v_i32_one_use: +; GFX906: ; %bb.0: +; GFX906-NEXT: v_xnor_b32_e32 v0, s0, v0 +; GFX906-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xnor_s_v_i32_one_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_xnor_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %xor = xor i32 %s, %v %d = xor i32 %xor, -1 @@ -266,16 +324,32 @@ define amdgpu_ps float @xnor_s_v_i32_one_use(i32 inreg %s, i32 %v) { } define amdgpu_ps float @xnor_v_s_i32_one_use(i32 inreg %s, i32 %v) { -; GCN-LABEL: xnor_v_s_i32_one_use: -; GCN: ; %bb.0: -; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX7-LABEL: xnor_v_s_i32_one_use: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: xnor_v_s_i32_one_use: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX900-LABEL: xnor_v_s_i32_one_use: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX900-NEXT: v_not_b32_e32 v0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX906-LABEL: xnor_v_s_i32_one_use: +; GFX906: ; %bb.0: +; GFX906-NEXT: v_xnor_b32_e64 v0, v0, s0 +; GFX906-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xnor_v_s_i32_one_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_xnor_b32_e64 v0, v0, s0 ; GFX10-NEXT: ; return to shader part epilog %xor = xor i32 %v, %s %d = xor i32 %xor, -1 @@ -314,19 +388,15 @@ define amdgpu_ps <2 x float> @xnor_i64_s_v_one_use(i64 inreg %a, i64 %b64) { ; GFX906-LABEL: xnor_i64_s_v_one_use: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX906-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX906-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX906-NEXT: v_not_b32_e32 v0, v0 -; GFX906-NEXT: v_not_b32_e32 v1, v1 +; GFX906-NEXT: v_xnor_b32_e32 v0, s0, v0 +; GFX906-NEXT: v_xnor_b32_e32 v1, s1, v1 ; GFX906-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xnor_i64_s_v_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX10-NEXT: v_not_b32_e32 v0, v0 -; GFX10-NEXT: v_not_b32_e32 v1, v1 +; GFX10-NEXT: v_xnor_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_xnor_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog entry: %b = shl i64 %b64, 29 @@ -367,19 +437,15 @@ define amdgpu_ps <2 x float> @xnor_i64_v_s_one_use(i64 inreg %a, i64 %b64) { ; GFX906-LABEL: xnor_i64_v_s_one_use: ; GFX906: ; %bb.0: ; GFX906-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX906-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX906-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX906-NEXT: v_not_b32_e32 v0, v0 -; GFX906-NEXT: v_not_b32_e32 v1, v1 +; GFX906-NEXT: v_xnor_b32_e64 v0, v0, s0 +; GFX906-NEXT: v_xnor_b32_e64 v1, v1, s1 ; GFX906-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xnor_i64_v_s_one_use: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX10-NEXT: v_not_b32_e32 v0, v0 -; GFX10-NEXT: v_not_b32_e32 v1, v1 +; GFX10-NEXT: v_xnor_b32_e64 v0, v0, s0 +; GFX10-NEXT: v_xnor_b32_e64 v1, v1, s1 ; GFX10-NEXT: ; return to shader part epilog %b = shl i64 %b64, 29 %xor = xor i64 %b, %a @@ -419,7 +485,7 @@ define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) { ; GFX10-LABEL: vector_xor_na_b_i32_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor3_b32 v0, v0, -1, v1 +; GFX10-NEXT: v_xnor_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %na = xor i32 %a, -1 @@ -458,7 +524,7 @@ define i32 @vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) { ; GFX10-LABEL: vector_xor_a_nb_i32_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor3_b32 v0, v1, -1, v0 +; GFX10-NEXT: v_xnor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %nb = xor i32 %b, -1 diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index cff9ce050667..d316e1003775 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -217,7 +217,7 @@ define ptr addrspace(3) @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 { ; AKF_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group -; ATTRIBUTOR_HSA-SAME: () #[[ATTR3:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) ; ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) @@ -233,9 +233,8 @@ attributes #1 = { nounwind } ; AKF_HSA: attributes #[[ATTR1]] = { nounwind } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/llvm/test/CodeGen/AMDGPU/alloc-all-regs-reserved-in-class.mir b/llvm/test/CodeGen/AMDGPU/alloc-all-regs-reserved-in-class.mir index f1308a1608c5..d40fb7bde069 100644 --- a/llvm/test/CodeGen/AMDGPU/alloc-all-regs-reserved-in-class.mir +++ b/llvm/test/CodeGen/AMDGPU/alloc-all-regs-reserved-in-class.mir @@ -1,12 +1,10 @@ -# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=greedy -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=greedy -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck --implicit-check-not=error %s # Check that there isn't an assert if we try to allocate a virtual register from # a class where all registers are reserved. All AGPRs are reserved on subtargets # that do not have them. -# CHECK-NOT: ran out of registers during register allocation -# CHECK: LLVM ERROR: no registers from class available to allocate -# CHECK-NOT: ran out of registers during register allocation +# CHECK: error: <unknown>:0:0: no registers from class available to allocate in function 'use_agpr' --- name: use_agpr diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll index e5d440b96349..33e7e7a7a019 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll @@ -73,7 +73,7 @@ define amdgpu_kernel void @kernel_uses_asm_physreg_tuple() { define void @func_uses_asm_virtreg_agpr() { ; CHECK-LABEL: define void @func_uses_asm_virtreg_agpr( -; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void asm sideeffect " ; CHECK-NEXT: ret void ; @@ -83,7 +83,7 @@ define void @func_uses_asm_virtreg_agpr() { define void @func_uses_asm_physreg_agpr() { ; CHECK-LABEL: define void @func_uses_asm_physreg_agpr( -; CHECK-SAME: ) #[[ATTR2]] { +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void asm sideeffect " ; CHECK-NEXT: ret void ; @@ -93,7 +93,7 @@ define void @func_uses_asm_physreg_agpr() { define void @func_uses_asm_physreg_agpr_tuple() { ; CHECK-LABEL: define void @func_uses_asm_physreg_agpr_tuple( -; CHECK-SAME: ) #[[ATTR2]] { +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void asm sideeffect " ; CHECK-NEXT: ret void ; @@ -105,7 +105,7 @@ declare void @unknown() define amdgpu_kernel void @kernel_calls_extern() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern( -; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: ret void ; @@ -115,8 +115,8 @@ define amdgpu_kernel void @kernel_calls_extern() { define amdgpu_kernel void @kernel_calls_extern_marked_callsite() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite( -; CHECK-SAME: ) #[[ATTR4]] { -; CHECK-NEXT: call void @unknown() #[[ATTR9:[0-9]+]] +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]] ; CHECK-NEXT: ret void ; call void @unknown() #0 @@ -125,7 +125,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() { define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect( -; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] { +; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: call void [[INDIRECT]]() ; CHECK-NEXT: ret void ; @@ -135,8 +135,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) { define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) { ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite( -; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] { -; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR9]] +; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR6]] ; CHECK-NEXT: ret void ; call void %indirect() #0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @kernel_transitively_uses_agpr_asm() { define void @empty() { ; CHECK-LABEL: define void @empty( -; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: ret void ; ret void @@ -163,7 +163,7 @@ define void @empty() { define void @also_empty() { ; CHECK-LABEL: define void @also_empty( -; CHECK-SAME: ) #[[ATTR5]] { +; CHECK-SAME: ) #[[ATTR1]] { ; CHECK-NEXT: ret void ; ret void @@ -254,14 +254,11 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) { attributes #0 = { "amdgpu-no-agpr" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3:[0-9]+]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-agpr" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index b7436aeb1d53..4f04c15b3d44 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -9999,3 +9999,100 @@ define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) { %result = udiv exact <2 x i64> %num, <i64 4096, i64 1024> ret <2 x i64> %result } + +define i64 @udiv_i64_gt_smax(i8 %size) { +; GFX6-LABEL: udiv_i64_gt_smax: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX6-NEXT: v_not_b32_e32 v1, v1 +; GFX6-NEXT: v_not_b32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s4, 0xcccccccd +; GFX6-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, s4 +; GFX6-NEXT: s_mov_b32 s6, 0xcccccccc +; GFX6-NEXT: v_mul_hi_u32 v5, v1, s4 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s6 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX6-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 3, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: udiv_i64_gt_smax: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 31 +; GFX9-NEXT: v_not_b32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: s_mov_b32 s4, 0xcccccccd +; GFX9-NEXT: v_ashrrev_i32_sdwa v1, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mul_hi_u32 v0, v4, s4 +; GFX9-NEXT: v_not_b32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s6, 0xcccccccc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, s4, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1] +; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %esize = sext i8 %size to i64 + %minus = sub nuw nsw i64 -1, %esize + %div = udiv i64 %minus, 10 + ret i64 %div +} + +define i64 @udiv_i64_9divbits(i8 %size) { +; GFX6-LABEL: udiv_i64_9divbits: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s4, 0x41200000 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX6-NEXT: v_mad_f32 v0, -v1, s4, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_and_b32_e32 v0, 0x1ff, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: udiv_i64_9divbits: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x41200000 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX9-NEXT: v_mad_f32 v0, -v1, s4, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 0x1ff, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %zextend = zext i8 %size to i64 + %num = add nuw nsw i64 1, %zextend + %div = udiv i64 %num, 10 + ret i64 %div +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll new file mode 100644 index 000000000000..906429212992 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s + +define i32 @use_grid_size_x_max_num_workgroups() #0 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4, !range !0 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_y_max_num_workgroups() #0 { +; CHECK-LABEL: define i32 @use_grid_size_y_max_num_workgroups( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4 +; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_Y]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.grid.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 4 + %grid.size.y = load i32, ptr addrspace(4) %gep.grid.size.y, align 4 + ret i32 %grid.size.y +} + +define i32 @use_grid_size_z_max_num_workgroups() #0 { +; CHECK-LABEL: define i32 @use_grid_size_z_max_num_workgroups( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8 +; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_Z]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.grid.size.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 8 + %grid.size.z = load i32, ptr addrspace(4) %gep.grid.size.z, align 4 + ret i32 %grid.size.z +} + +define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type() #0 { +; CHECK-LABEL: define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load <2 x i16>, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4 +; CHECK-NEXT: ret <2 x i16> [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load <2 x i16>, ptr addrspace(4) %implicitarg.ptr, align 4 + ret <2 x i16> %grid.size.x +} + +define i32 @use_grid_size_x_max_num_workgroups_max_minus_1() #1 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_x_max_num_workgroups_max() #2 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4 +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_x_max_num_workgroups_zero() #3 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_zero( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4 +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3 + +attributes #0 = { "amdgpu-max-num-workgroups"="36,42,89" } +attributes #1 = { "amdgpu-max-num-workgroups"="4294967294,42,89" } +attributes #2 = { "amdgpu-max-num-workgroups"="4294967295,42,89" } +attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" } + +!0 = !{i32 0, i32 -1} + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. +; CHECK: [[RNG0]] = !{i32 1, i32 37} +; CHECK: [[RNG1]] = !{i32 1, i32 43} +; CHECK: [[RNG2]] = !{i32 1, i32 90} +; CHECK: [[RNG3]] = !{i32 1, i32 -1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-image-function-signatures.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-image-function-signatures.ll new file mode 100644 index 000000000000..c3bdf06b1447 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-image-function-signatures.ll @@ -0,0 +1,341 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s + +; Make sure we can produce a valid FunctionType for the expected +; signature of image functions. + +define i32 @test_get_image_width_ro_image1d_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_ro_image1d_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width14ocl_image1d_ro(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width14ocl_image1d_ro(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width14ocl_image1d_ro(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_wo_image1d_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_wo_image1d_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width14ocl_image1d_wo(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width14ocl_image1d_wo(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width14ocl_image1d_wo(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_rw_image1d_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_rw_image1d_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width14ocl_image1d_rw(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width14ocl_image1d_rw(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width14ocl_image1d_rw(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_ro_image1d_buffer_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_ro_image1d_buffer_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width21ocl_image1d_buffer_ro(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width21ocl_image1d_buffer_ro(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width21ocl_image1d_buffer_ro(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_wo_image1d_buffer_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_wo_image1d_buffer_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width21ocl_image1d_buffer_wo(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width21ocl_image1d_buffer_wo(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width21ocl_image1d_buffer_wo(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_rw_image1d_buffer_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_rw_image1d_buffer_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width21ocl_image1d_buffer_rw(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width21ocl_image1d_buffer_rw(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width21ocl_image1d_buffer_rw(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_ro_image2d_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_ro_image2d_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width14ocl_image2d_ro(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width14ocl_image2d_ro(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width14ocl_image2d_ro(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_wo_image2d_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_wo_image2d_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width14ocl_image2d_wo(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width14ocl_image2d_wo(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width14ocl_image2d_wo(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_rw_image2d_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_rw_image2d_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width14ocl_image2d_rw(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width14ocl_image2d_rw(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width14ocl_image2d_rw(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_ro_image3d_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_ro_image3d_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width14ocl_image3d_ro(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width14ocl_image3d_ro(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width14ocl_image3d_ro(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_wo_image3d_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_wo_image3d_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width14ocl_image3d_wo(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width14ocl_image3d_wo(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width14ocl_image3d_wo(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_rw_image3d_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_rw_image3d_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width14ocl_image3d_rw(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width14ocl_image3d_rw(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width14ocl_image3d_rw(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_ro_image1d_array_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_ro_image1d_array_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width20ocl_image1d_array_ro(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width20ocl_image1d_array_ro(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width20ocl_image1d_array_ro(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_wo_image1d_array_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_wo_image1d_array_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width20ocl_image1d_array_wo(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width20ocl_image1d_array_wo(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width20ocl_image1d_array_wo(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_rw_image1d_array_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_rw_image1d_array_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width20ocl_image1d_array_rw(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width20ocl_image1d_array_rw(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width20ocl_image1d_array_rw(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_ro_image2d_array_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_ro_image2d_array_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width20ocl_image2d_array_ro(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width20ocl_image2d_array_ro(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width20ocl_image2d_array_ro(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_wo_image2d_array_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_wo_image2d_array_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width20ocl_image2d_array_wo(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width20ocl_image2d_array_wo(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width20ocl_image2d_array_wo(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_rw_image2d_array_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_rw_image2d_array_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width20ocl_image2d_array_rw(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width20ocl_image2d_array_rw(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width20ocl_image2d_array_rw(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_ro_image2d_depth_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_ro_image2d_depth_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width20ocl_image2d_depth_ro(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width20ocl_image2d_depth_ro(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width20ocl_image2d_depth_ro(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_wo_image2d_depth_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_wo_image2d_depth_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width20ocl_image2d_depth_wo(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width20ocl_image2d_depth_wo(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width20ocl_image2d_depth_wo(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_rw_image2d_depth_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_rw_image2d_depth_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width20ocl_image2d_depth_rw(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width20ocl_image2d_depth_rw(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width20ocl_image2d_depth_rw(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_ro_image2d_array_depth_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_ro_image2d_array_depth_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width26ocl_image2d_array_depth_ro(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width26ocl_image2d_array_depth_ro(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width26ocl_image2d_array_depth_ro(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_wo_image2d_array_depth_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_wo_image2d_array_depth_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width26ocl_image2d_array_depth_wo(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width26ocl_image2d_array_depth_wo(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width26ocl_image2d_array_depth_wo(ptr addrspace(4)) #1 + +define i32 @test_get_image_width_rw_image2d_array_depth_t(ptr addrspace(4) readnone %img) { +; CHECK-LABEL: define i32 @test_get_image_width_rw_image2d_array_depth_t( +; CHECK-SAME: ptr addrspace(4) readnone [[IMG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z15get_image_width26ocl_image2d_array_depth_rw(ptr addrspace(4) [[IMG]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = tail call i32 @_Z15get_image_width26ocl_image2d_array_depth_rw(ptr addrspace(4) %img) + ret i32 %call +} + +declare i32 @_Z15get_image_width26ocl_image2d_array_depth_rw(ptr addrspace(4)) #1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-ldexp.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-ldexp.ll index 24082b8c6661..dc275b33b012 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-ldexp.ll @@ -242,6 +242,47 @@ define float @test_ldexp_f32_strictfp(float %x, i32 %y) #4 { ret float %ldexp } +;--------------------------------------------------------------------- +; Invalid signatures +;--------------------------------------------------------------------- + +; Declared with wrong type, second argument is float +declare float @_Z5ldexpff(float noundef, float noundef) + +define float @call_wrong_typed_ldexp_f32_second_arg(float %x, float %wrongtype) { +; CHECK-LABEL: define float @call_wrong_typed_ldexp_f32_second_arg +; CHECK-SAME: (float [[X:%.*]], float [[WRONGTYPE:%.*]]) { +; CHECK-NEXT: [[CALL:%.*]] = call float @_Z5ldexpff(float [[X]], float [[WRONGTYPE]]) +; CHECK-NEXT: ret float [[CALL]] +; + %call = call float @_Z5ldexpff(float %x, float %wrongtype) + ret float %call +} + +declare <2 x float> @_Z5ldexpDv2_fS_(<2 x float>, <2 x float>) + +define <2 x float> @call_wrong_typed_ldexp_v2f32_second_arg(<2 x float> %x, <2 x float> %wrongtype) { +; CHECK-LABEL: define <2 x float> @call_wrong_typed_ldexp_v2f32_second_arg +; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x float> [[WRONGTYPE:%.*]]) { +; CHECK-NEXT: [[CALL:%.*]] = call <2 x float> @_Z5ldexpDv2_fS_(<2 x float> [[X]], <2 x float> [[WRONGTYPE]]) +; CHECK-NEXT: ret <2 x float> [[CALL]] +; + %call = call <2 x float> @_Z5ldexpDv2_fS_(<2 x float> %x, <2 x float> %wrongtype) + ret <2 x float> %call +} + +declare <2 x float> @_Z5ldexpDv2_ff(<2 x float>, float) + +define <2 x float> @call_wrong_typed_ldexp_v2f32_f32(<2 x float> %x, float %wrongtype) { +; CHECK-LABEL: define <2 x float> @call_wrong_typed_ldexp_v2f32_f32 +; CHECK-SAME: (<2 x float> [[X:%.*]], float [[WRONGTYPE:%.*]]) { +; CHECK-NEXT: [[CALL:%.*]] = call <2 x float> @_Z5ldexpDv2_ff(<2 x float> [[X]], float [[WRONGTYPE]]) +; CHECK-NEXT: ret <2 x float> [[CALL]] +; + %call = call <2 x float> @_Z5ldexpDv2_ff(<2 x float> %x, float %wrongtype) + ret <2 x float> %call +} + attributes #0 = { nobuiltin } attributes #1 = { "no-builtins" } attributes #2 = { nounwind memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 9d0d85da9f7f..25b6b7be1f3b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -82,10 +82,10 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) { ; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 ; CHECK-NEXT: s_mov_b32 s4, 0x800000 ; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3 +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3 ; CHECK-NEXT: v_log_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -98,10 +98,10 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) { ; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3 ; CHECK-NEXT: v_exp_f32_e32 v2, v2 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc +; CHECK-NEXT: v_not_b32_e32 v3, 63 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, v2, v3 +; CHECK-NEXT: v_ldexp_f32 v2, v2, v3 ; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = sitofp i32 %y.i to float @@ -228,9 +228,9 @@ define float @test_powr_fast_f32(float %x, float %y) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0x800000 ; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; CHECK-NEXT: v_ldexp_f32 v0, v0, v3 ; CHECK-NEXT: v_log_f32_e32 v0, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -242,9 +242,9 @@ define float @test_powr_fast_f32(float %x, float %y) { ; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; CHECK-NEXT: v_fma_f32 v0, v1, v0, v2 ; CHECK-NEXT: v_exp_f32_e32 v0, v0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 +; CHECK-NEXT: v_not_b32_e32 v1, 63 +; CHECK-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; CHECK-NEXT: v_ldexp_f32 v0, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %powr = tail call fast float @_Z4powrff(float %x, float %y) ret float %powr @@ -368,9 +368,9 @@ define float @test_pown_fast_f32(float %x, i32 %y) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0x800000 ; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3 ; CHECK-NEXT: v_log_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -383,10 +383,10 @@ define float @test_pown_fast_f32(float %x, i32 %y) { ; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3 ; CHECK-NEXT: v_exp_f32_e32 v2, v2 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc +; CHECK-NEXT: v_not_b32_e32 v3, 63 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, v2, v3 +; CHECK-NEXT: v_ldexp_f32 v2, v2, v3 ; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2 ; CHECK-NEXT: s_setpc_b64 s[30:31] %call = tail call fast float @_Z4pownfi(float %x, i32 %y) @@ -511,9 +511,9 @@ define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0x800000 ; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_mul_f32_e64 v0, |v0|, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; CHECK-NEXT: v_ldexp_f32 v0, |v0|, v3 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; CHECK-NEXT: v_log_f32_e32 v0, v0 ; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 @@ -527,9 +527,9 @@ define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) { ; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; CHECK-NEXT: v_fma_f32 v0, v0, v1, v2 ; CHECK-NEXT: v_exp_f32_e32 v0, v0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 +; CHECK-NEXT: v_not_b32_e32 v1, 63 +; CHECK-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; CHECK-NEXT: v_ldexp_f32 v0, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = shl i32 %y.arg, 1 %call = tail call fast float @_Z4pownfi(float %x, i32 %y) @@ -651,9 +651,9 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0x800000 ; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; CHECK-NEXT: v_mul_f32_e64 v3, |v0|, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3 ; CHECK-NEXT: v_or_b32_e32 v1, 1, v1 ; CHECK-NEXT: v_log_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 @@ -667,10 +667,10 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) { ; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; CHECK-NEXT: v_fma_f32 v1, v2, v1, v3 ; CHECK-NEXT: v_exp_f32_e32 v1, v1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0x1f800000 -; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; CHECK-NEXT: v_not_b32_e32 v2, 63 +; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; CHECK-NEXT: s_brev_b32 s4, -2 -; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 +; CHECK-NEXT: v_ldexp_f32 v1, v1, v2 ; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = or i32 %y.arg, 1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-unexpected-types.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-unexpected-types.ll new file mode 100644 index 000000000000..1cdc06ea942c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-unexpected-types.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib -amdgpu-prelink %s | FileCheck %s +; Make sure there are no crashes on unexpected types + +%struct.vfloat3 = type { float, float, float } + +declare hidden %struct.vfloat3 @_Z3mix7vfloat3S_f(float, float, float, float, float, float, float) + +define %struct.vfloat3 @_Z8test_mix7vfloat3S_f(float %x.coerce0, float %x.coerce1, float %x.coerce2, float %y.coerce0, float %y.coerce1, float %y.coerce2, float %t) { +; CHECK-LABEL: define %struct.vfloat3 @_Z8test_mix7vfloat3S_f( +; CHECK-SAME: float [[X_COERCE0:%.*]], float [[X_COERCE1:%.*]], float [[X_COERCE2:%.*]], float [[Y_COERCE0:%.*]], float [[Y_COERCE1:%.*]], float [[Y_COERCE2:%.*]], float [[T:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call [[STRUCT_VFLOAT3:%.*]] @[[_Z3MIX7VFLOAT3S_F:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](float [[X_COERCE0]], float [[X_COERCE1]], float [[X_COERCE2]], float [[Y_COERCE0]], float [[Y_COERCE1]], float [[Y_COERCE2]], float [[T]]) +; CHECK-NEXT: ret [[STRUCT_VFLOAT3]] [[CALL]] +; +entry: + %call = call %struct.vfloat3 @_Z3mix7vfloat3S_f(float %x.coerce0, float %x.coerce1, float %x.coerce2, float %y.coerce0, float %y.coerce1, float %y.coerce2, float %t) + ret %struct.vfloat3 %call +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll new file mode 100644 index 000000000000..b9fa89dd6f0a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to check if static LDS is lowered correctly when a non-kernel with LDS accesses is called from kernel. +; Also checks if amdgpu-no-lds-kernel-id attribute is removed from the list of attributes +@lds_1 = internal addrspace(3) global [1 x i8] poison, align 1 +@lds_2 = internal addrspace(3) global [1 x i32] poison, align 2 +@lds_3 = external addrspace(3) global [3 x i8], align 4 +@lds_4 = external addrspace(3) global [4 x i8], align 8 + +;. +; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 96, i32 3, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 128, i32 4, i32 32 } }, no_sanitize_address +; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [1 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.k0], no_sanitize_address +; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [1 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 3, i32 0), ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.k0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0)]], no_sanitize_address +;. +define void @use_variables() sanitize_address { +; CHECK-LABEL: define void @use_variables( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(1) [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP11]] +; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr +; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr +; CHECK-NEXT: store i8 3, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP12]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP14]] +; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP15]], align 8 +; CHECK-NEXT: ret void +; + %X = addrspacecast ptr addrspace(3) @lds_3 to ptr + store i8 3, ptr addrspacecast( ptr addrspace(3) @lds_3 to ptr), align 4 + store i8 3, ptr addrspace(3) @lds_4, align 8 + ret void +} + +define amdgpu_kernel void @k0() sanitize_address #1 { +; CHECK-LABEL: define amdgpu_kernel void @k0( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB24:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 33 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 31) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68 +; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(1) [[TMP18]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP19]], i64 28) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 99 +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 29) +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 132 +; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 28) +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[BB24]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP28]] +; CHECK-NEXT: call void @use_variables() +; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr addrspace(3) [[TMP27]] to i32 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP25]], i32 [[TMP30]] +; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP31]], align 1 +; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr addrspace(3) [[TMP29]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP25]], i32 [[TMP32]] +; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP33]], align 2 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP34:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr [[TMP34]] to i64 +; CHECK-NEXT: [[TMP36:%.*]] = ptrtoint ptr addrspace(1) [[TMP25]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP36]], i64 [[TMP35]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; + call void @use_variables() + store i8 7, ptr addrspace(3) @lds_1, align 1 + store i32 8, ptr addrspace(3) @lds_2, align 2 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +attributes #1 = { "amdgpu-no-lds-kernel-id" } +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address } +; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +;. +; CHECK: [[META0]] = !{i32 0, i32 1} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +; CHECK: [[META2]] = !{i32 0} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-no-heap-ptr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-no-heap-ptr.ll new file mode 100644 index 000000000000..73ffcdd783de --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-no-heap-ptr.ll @@ -0,0 +1,160 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to check if static LDS accesses in kernel are lowered correctly. Also checks if amdgpu-no-heap-ptr attribute +; is removed from the list of attributes +@lds_1 = internal addrspace(3) global [1 x i8] poison, align 4 +@lds_2 = internal addrspace(3) global [1 x i32] poison, align 8 + +;. +; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 1, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address +;. +define amdgpu_kernel void @k0() sanitize_address #1 { +; CHECK-LABEL: define amdgpu_kernel void @k0( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB20:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 33 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 31) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68 +; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(1) [[TMP18]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP19]], i64 28) +; CHECK-NEXT: br label %[[BB20]] +; CHECK: [[BB20]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(3) [[TMP23]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i32 [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr addrspace(1) [[TMP27]] to i64 +; CHECK-NEXT: [[TMP29:%.*]] = lshr i64 [[TMP28]], 3 +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[TMP29]], 2147450880 +; CHECK-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = icmp ne i8 [[TMP32]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = and i64 [[TMP28]], 7 +; CHECK-NEXT: [[TMP35:%.*]] = trunc i64 [[TMP34]] to i8 +; CHECK-NEXT: [[TMP36:%.*]] = icmp sge i8 [[TMP35]], [[TMP32]] +; CHECK-NEXT: [[TMP37:%.*]] = and i1 [[TMP33]], [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP37]]) +; CHECK-NEXT: [[TMP39:%.*]] = icmp ne i64 [[TMP38]], 0 +; CHECK-NEXT: br i1 [[TMP39]], label %[[ASAN_REPORT:.*]], label %[[BB42:.*]], !prof [[PROF2:![0-9]+]] +; CHECK: [[ASAN_REPORT]]: +; CHECK-NEXT: br i1 [[TMP37]], label %[[BB40:.*]], label %[[BB41:.*]] +; CHECK: [[BB40]]: +; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP28]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label %[[BB41]] +; CHECK: [[BB41]]: +; CHECK-NEXT: br label %[[BB42]] +; CHECK: [[BB42]]: +; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP27]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = ptrtoint ptr addrspace(3) [[TMP25]] to i32 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i32 [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = ptrtoint ptr addrspace(1) [[TMP44]] to i64 +; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], 3 +; CHECK-NEXT: [[TMP47:%.*]] = inttoptr i64 [[TMP46]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP48:%.*]] = ptrtoint ptr addrspace(1) [[TMP44]] to i64 +; CHECK-NEXT: [[TMP49:%.*]] = lshr i64 [[TMP48]], 3 +; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[TMP49]], 2147450880 +; CHECK-NEXT: [[TMP51:%.*]] = inttoptr i64 [[TMP50]] to ptr +; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr [[TMP51]], align 1 +; CHECK-NEXT: [[TMP53:%.*]] = icmp ne i8 [[TMP52]], 0 +; CHECK-NEXT: [[TMP54:%.*]] = and i64 [[TMP48]], 7 +; CHECK-NEXT: [[TMP55:%.*]] = trunc i64 [[TMP54]] to i8 +; CHECK-NEXT: [[TMP56:%.*]] = icmp sge i8 [[TMP55]], [[TMP52]] +; CHECK-NEXT: [[TMP57:%.*]] = and i1 [[TMP53]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP57]]) +; CHECK-NEXT: [[TMP59:%.*]] = icmp ne i64 [[TMP58]], 0 +; CHECK-NEXT: br i1 [[TMP59]], label %[[ASAN_REPORT1:.*]], label %[[BB62:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT1]]: +; CHECK-NEXT: br i1 [[TMP57]], label %[[BB60:.*]], label %[[BB61:.*]] +; CHECK: [[BB60]]: +; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP48]]) #[[ATTR6]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label %[[BB61]] +; CHECK: [[BB61]]: +; CHECK-NEXT: br label %[[BB62]] +; CHECK: [[BB62]]: +; CHECK-NEXT: [[TMP63:%.*]] = ptrtoint ptr addrspace(1) [[TMP47]] to i64 +; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP63]], 3 +; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[TMP64]], 2147450880 +; CHECK-NEXT: [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr +; CHECK-NEXT: [[TMP67:%.*]] = load i8, ptr [[TMP66]], align 1 +; CHECK-NEXT: [[TMP68:%.*]] = icmp ne i8 [[TMP67]], 0 +; CHECK-NEXT: [[TMP69:%.*]] = and i64 [[TMP63]], 7 +; CHECK-NEXT: [[TMP70:%.*]] = trunc i64 [[TMP69]] to i8 +; CHECK-NEXT: [[TMP71:%.*]] = icmp sge i8 [[TMP70]], [[TMP67]] +; CHECK-NEXT: [[TMP72:%.*]] = and i1 [[TMP68]], [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP72]]) +; CHECK-NEXT: [[TMP74:%.*]] = icmp ne i64 [[TMP73]], 0 +; CHECK-NEXT: br i1 [[TMP74]], label %[[ASAN_REPORT2:.*]], label %[[BB77:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT2]]: +; CHECK-NEXT: br i1 [[TMP72]], label %[[BB75:.*]], label %[[BB76:.*]] +; CHECK: [[BB75]]: +; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP63]]) #[[ATTR6]] +; CHECK-NEXT: call void @llvm.amdgcn.unreachable() +; CHECK-NEXT: br label %[[BB76]] +; CHECK: [[BB76]]: +; CHECK-NEXT: br label %[[BB77]] +; CHECK: [[BB77]]: +; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP44]], align 2 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP78:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP79:%.*]] = ptrtoint ptr [[TMP78]] to i64 +; CHECK-NEXT: [[TMP80:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP80]], i64 [[TMP79]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; + store i8 7, ptr addrspace(3) @lds_1, align 4 + store i32 8, ptr addrspace(3) @lds_2, align 2 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +attributes #1 = { "amdgpu-no-heap-ptr" } +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent nocallback nofree nounwind } +; CHECK: attributes #[[ATTR6]] = { nomerge } +;. +; CHECK: [[META0]] = !{i32 0, i32 1} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index c1a957dec3e8..82832277b1ab 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -504,7 +504,7 @@ define amdgpu_kernel void @v2float_stack(ptr addrspace(1) %out, i32 %a) { } ; OPT-LABEL: @direct_alloca_read_0xi32( -; OPT: store [0 x i32] undef, ptr addrspace(3) +; OPT: store [0 x i32] poison, ptr addrspace(3) ; OPT: load [0 x i32], ptr addrspace(3) define amdgpu_kernel void @direct_alloca_read_0xi32(ptr addrspace(1) %out, i32 %index) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll index 1a0fda3d54d3..a5f915c48ebe 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -143,7 +143,7 @@ attributes #0 = { nounwind } ; GCN: amdpal.pipelines: ; GCN-NEXT: - .registers: ; SDAG-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}} -; GISEL-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01cb{{$}} +; GISEL-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}} ; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}} ; GCN-NEXT: .shader_functions: ; GCN-NEXT: dynamic_stack: @@ -157,10 +157,10 @@ attributes #0 = { nounwind } ; GCN-NEXT: .backend_stack_size: 0x10{{$}} ; GCN-NEXT: .lds_size: 0{{$}} ; SDAG-NEXT: .sgpr_count: 0x25{{$}} -; GISEL-NEXT: .sgpr_count: 0x27{{$}} +; GISEL-NEXT: .sgpr_count: 0x26{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; SDAG-NEXT: .vgpr_count: 0x3{{$}} -; GISEL-NEXT: .vgpr_count: 0x5{{$}} +; GISEL-NEXT: .vgpr_count: 0x4{{$}} ; GCN-NEXT: multiple_stack: ; GCN-NEXT: .backend_stack_size: 0x24{{$}} ; GCN-NEXT: .lds_size: 0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll b/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll index 28722021e044..7e0208cd1f45 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll @@ -117,14 +117,14 @@ define void @call_no_dispatch_id() { ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index 3d4ae84d9c69..ea3f08ede2c5 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -593,7 +593,7 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr -; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; ATTRIBUTOR_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8 ; ATTRIBUTOR_HSA-NEXT: ret void @@ -645,7 +645,7 @@ define internal void @defined.func() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void @@ -658,7 +658,7 @@ define void @func_call_external() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external -; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @external.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -673,7 +673,7 @@ define void @func_call_defined() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined -; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -687,8 +687,8 @@ define void @func_call_asm() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm -; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] { -; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR28:[0-9]+]] +; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { +; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR26:[0-9]+]] ; ATTRIBUTOR_HSA-NEXT: ret void ; call void asm sideeffect "", ""() #3 @@ -702,7 +702,7 @@ define amdgpu_kernel void @kern_call_external() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external -; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: call void @external.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -717,7 +717,7 @@ define amdgpu_kernel void @func_kern_defined() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined -; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -769,7 +769,7 @@ define float @func_indirect_call(ptr %fptr) #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call -; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -788,7 +788,7 @@ define float @func_extern_call() #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call -; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @extern() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -806,7 +806,7 @@ define float @func_null_call(ptr %fptr) #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call -; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] { +; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null() ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -827,7 +827,7 @@ define float @func_other_intrinsic_call(float %arg) #3 { ; AKF_HSA-NEXT: ret float [[FADD]] ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call -; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR17]] { +; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) ; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] @@ -845,7 +845,7 @@ define amdgpu_kernel void @kern_sanitize_address() #4 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -861,7 +861,7 @@ define void @func_sanitize_address() #4 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -877,7 +877,7 @@ define void @func_indirect_sanitize_address() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -893,7 +893,7 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -912,7 +912,7 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address -; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: call void @extern_func_sanitize_address() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -928,7 +928,7 @@ define internal void @enqueue_block_def() #6 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def -; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void @@ -941,7 +941,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl -; ATTRIBUTOR_HSA-SAME: () #[[ATTR26:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_decl() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -956,7 +956,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def -; ATTRIBUTOR_HSA-SAME: () #[[ATTR27:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_def() ; ATTRIBUTOR_HSA-NEXT: ret void ; @@ -969,7 +969,7 @@ define void @unused_enqueue_block() { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block -; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void @@ -980,7 +980,7 @@ define internal void @known_func() { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR25]] { ; ATTRIBUTOR_HSA-NEXT: ret void ; ret void @@ -994,8 +994,8 @@ define amdgpu_kernel void @kern_callsite_enqueue_block() { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block -; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] { -; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR29:[0-9]+]] +; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] { +; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR27:[0-9]+]] ; ATTRIBUTOR_HSA-NEXT: ret void ; call void @known_func() #6 @@ -1025,35 +1025,33 @@ attributes #6 = { "enqueued-block" } ; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind } -; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR20:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR21:[0-9]+]] = { "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { nounwind } +; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "enqueued-block" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 43cdf85ed381..6896ac8d2e5d 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -442,7 +442,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast -; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR12]] { +; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void @@ -621,7 +621,7 @@ define void @use_alloca_func() #1 { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func -; ATTRIBUTOR_HSA-SAME: () #[[ATTR13:[0-9]+]] { +; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) ; ATTRIBUTOR_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void @@ -643,19 +643,19 @@ attributes #1 = { nounwind } ; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll index 547ff69592ca..89fe46d97530 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -398,13 +398,13 @@ attributes #1 = { nounwind } ; AKF_CHECK: attributes #[[ATTR1]] = { nounwind } ;. ; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll new file mode 100644 index 000000000000..678c3a0158ec --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s + +; External call to avoid inferring argument attributes. This makes the +; final attribute groups easier to read +declare void @dummy() + +define void @extern_callee() { +; CHECK-LABEL: define void @extern_callee( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define internal void @callee_1_2_3() { +; CHECK-LABEL: define internal void @callee_1_2_3( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_1_2_3() #0 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_1_2_3( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: call void @callee_1_2_3() +; CHECK-NEXT: call void @extern_callee() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @callee_1_2_3() + call void @extern_callee() + call void @dummy() + ret void +} + +attributes #0 = {"amdgpu-max-num-workgroups"="1,2,3"} + +; -> 100,10,99 +define internal void @callee_merge_100_8_32__16_10_99() { +; CHECK-LABEL: define internal void @callee_merge_100_8_32__16_10_99( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_100_8_32() #1 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_100_8_32( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() +; CHECK-NEXT: ret void +; + call void @callee_merge_100_8_32__16_10_99() + ret void +} + +define amdgpu_cs void @amdgpu_cs_100_8_32() #1 { +; CHECK-LABEL: define amdgpu_cs void @amdgpu_cs_100_8_32( +; CHECK-SAME: ) #[[ATTR4]] { +; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() +; CHECK-NEXT: ret void +; + call void @callee_merge_100_8_32__16_10_99() + ret void +} + +attributes #1 = {"amdgpu-max-num-workgroups"="100,8,32"} + +define amdgpu_kernel void @kernel_16_10_99() #2 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_16_10_99( +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK-NEXT: call void @callee_merge_100_8_32__16_10_99() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @callee_merge_100_8_32__16_10_99() + call void @dummy() + ret void +} + +attributes #2 = {"amdgpu-max-num-workgroups"="16,10,99"} + +define internal void @merge_to_worst_case() { +; CHECK-LABEL: define internal void @merge_to_worst_case( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define internal void @callee_x_worst_case() { +; CHECK-LABEL: define internal void @callee_x_worst_case( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_x_maximum() #3 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_x_maximum( +; CHECK-SAME: ) #[[ATTR6:[0-9]+]] { +; CHECK-NEXT: call void @merge_to_worst_case() +; CHECK-NEXT: call void @callee_x_worst_case() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @merge_to_worst_case() + call void @callee_x_worst_case() + call void @dummy() + ret void +} + +attributes #3 = {"amdgpu-max-num-workgroups"="4294967295,1,1"} + +define amdgpu_kernel void @kernel_y_maximum() #4 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_y_maximum( +; CHECK-SAME: ) #[[ATTR7:[0-9]+]] { +; CHECK-NEXT: call void @merge_to_worst_case() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @merge_to_worst_case() + call void @dummy() + ret void +} + +attributes #4 = {"amdgpu-max-num-workgroups"="1,4294967295,1"} + +define amdgpu_kernel void @kernel_z_maximum() #5 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_z_maximum( +; CHECK-SAME: ) #[[ATTR8:[0-9]+]] { +; CHECK-NEXT: call void @merge_to_worst_case() +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @merge_to_worst_case() + call void @dummy() + ret void +} + +attributes #5 = {"amdgpu-max-num-workgroups"="1,1,4294967295"} + +; Make sure the attribute isn't lost from the callee. +define internal void @annotated_callee_from_unannotated_kernel() #6 { +; CHECK-LABEL: define internal void @annotated_callee_from_unannotated_kernel( +; CHECK-SAME: ) #[[ATTR9:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +attributes #6 = {"amdgpu-max-num-workgroups"="42,99,123"} + +define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee() { +; CHECK-LABEL: define amdgpu_kernel void @unannotated_kernel_calls_annotated_callee( +; CHECK-SAME: ) #[[ATTR10:[0-9]+]] { +; CHECK-NEXT: call void @annotated_callee_from_unannotated_kernel() +; CHECK-NEXT: ret void +; + call void @annotated_callee_from_unannotated_kernel() + ret void +} + + +define internal void @annotated_callee_merge_caller() #7 { +; CHECK-LABEL: define internal void @annotated_callee_merge_caller( +; CHECK-SAME: ) #[[ATTR11:[0-9]+]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +attributes #7 = {"amdgpu-max-num-workgroups"="512,256,1024"} + +define amdgpu_kernel void @call_annotated_callee_merge_caller() #8 { +; CHECK-LABEL: define amdgpu_kernel void @call_annotated_callee_merge_caller( +; CHECK-SAME: ) #[[ATTR12:[0-9]+]] { +; CHECK-NEXT: call void @annotated_callee_merge_caller() +; CHECK-NEXT: ret void +; + call void @annotated_callee_merge_caller() + ret void +} + +attributes #8 = {"amdgpu-max-num-workgroups"="256,128,2048"} + +define internal void @called_by_explicit_worst_case() { +; CHECK-LABEL: define internal void @called_by_explicit_worst_case( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: ret void +; + call void @dummy() + ret void +} + +define amdgpu_kernel void @kernel_explicit_worst_case() #9 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_explicit_worst_case( +; CHECK-SAME: ) #[[ATTR13:[0-9]+]] { +; CHECK-NEXT: call void @called_by_explicit_worst_case() +; CHECK-NEXT: ret void +; + call void @called_by_explicit_worst_case() + ret void +} + +attributes #9 = {"amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295"} + +;. +; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="1,2,3" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="1,2,3" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,10,99" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="100,8,32" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="16,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="42,99,123" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR10]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR11]] = { "amdgpu-max-num-workgroups"="256,128,1024" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR12]] = { "amdgpu-max-num-workgroups"="256,128,2048" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR13]] = { "amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll new file mode 100644 index 000000000000..682a57571d11 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll @@ -0,0 +1,570 @@ +; Test the generation of the attribute amdgpu-no-flat-scratch-init +; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator | FileCheck -check-prefixes=GFX10 %s + +;; tests of addrspacecast + +define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { + store volatile i32 0, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { + store volatile i32 0, ptr addrspace(1) %ptr + ret void +} + +define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { + %stof = addrspacecast ptr addrspace(1) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { + %stof = addrspacecast ptr addrspace(1) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { + store volatile i32 0, ptr addrspace(2) %ptr + ret void +} + +define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { + store volatile i32 0, ptr addrspace(2) %ptr + ret void +} + +define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { + %stof = addrspacecast ptr addrspace(2) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { + %stof = addrspacecast ptr addrspace(2) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { + store volatile i32 0, ptr addrspace(3) %ptr + ret void +} + +define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { + store volatile i32 0, ptr addrspace(3) %ptr + ret void +} + +define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { + %stof = addrspacecast ptr addrspace(3) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { + %stof = addrspacecast ptr addrspace(3) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { + store volatile i32 0, ptr addrspace(4) %ptr + ret void +} + +define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { + store volatile i32 0, ptr addrspace(4) %ptr + ret void +} + +define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { + %stof = addrspacecast ptr addrspace(4) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { + %stof = addrspacecast ptr addrspace(4) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + store volatile i32 0, ptr addrspace(5) %ptr + ret void +} + +define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + store volatile i32 0, ptr addrspace(5) %ptr + ret void +} + +define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +;; tests of indirect call, intrinsics + +@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4 + +define void @with_indirect_call() { + %fptr = load ptr, ptr addrspace(4) @gv.fptr0 + call void %fptr() + ret void +} + +define amdgpu_kernel void @with_indirect_call_cc_kernel() { + %fptr = load ptr, ptr addrspace(4) @gv.fptr0 + call void %fptr() + ret void +} + +define void @call_with_indirect_call() { + call void @with_indirect_call() + ret void +} + +define amdgpu_kernel void @call_with_indirect_call_cc_kernel() { + call void @with_indirect_call() + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() + +define void @use_intrinsic_workitem_id_x() { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, ptr addrspace(1) null + ret void +} + +define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, ptr addrspace(1) null + ret void +} + +define void @call_use_intrinsic_workitem_id_x() { + call void @use_intrinsic_workitem_id_x() + ret void +} + +define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { + call void @use_intrinsic_workitem_id_x() + ret void +} + +; GFX10: name: without_global_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: without_global_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_global_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_global_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } +; +; GFX10: name: without_region_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: without_region_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_region_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_region_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: without_group_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: without_group_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_group_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_group_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: without_constant_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: without_constant_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_constant_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_constant_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: call_without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: call_with_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_with_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: call_both_with_and_without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: call_call_without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_call_without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: call_call_with_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_call_with_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: call_call_both_with_and_without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_cast_call_without_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_cast_call_without_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_cast_call_with_private_to_flat_addrspacecast +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_cast_call_with_private_to_flat_addrspacecast_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; +; GFX10: name: with_indirect_call +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: with_indirect_call_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr8_sgpr9' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr12_sgpr13' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr14' } +; +; GFX10: name: call_with_indirect_call +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_with_indirect_call_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr8_sgpr9' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr12_sgpr13' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr14' } + +; +; GFX10: name: use_intrinsic_workitem_id_x +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: use_intrinsic_workitem_id_x_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr4' } +; +; GFX10: name: call_use_intrinsic_workitem_id_x +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; GFX10-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr12' } +; +; GFX10: name: call_use_intrinsic_workitem_id_x_cc_kernel +; GFX10: argumentInfo: +; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr4' } diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll new file mode 100644 index 000000000000..25da00e6bde3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll @@ -0,0 +1,896 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; Test the generation of the attribute amdgpu-no-flat-scratch-init +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s + +;; tests of addrspacecast + +;. +; GFX9: @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4 +;. +; GFX10: @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4 +;. +define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { +; GFX9-LABEL: define void @without_global_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX9-NEXT: store volatile i32 0, ptr addrspace(1) [[PTR]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @without_global_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX10-NEXT: store volatile i32 0, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: ret void +; + store volatile i32 0, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: store volatile i32 0, ptr addrspace(1) [[PTR]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: store volatile i32 0, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: ret void +; + store volatile i32 0, ptr addrspace(1) %ptr + ret void +} + +define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) { +; GFX9-LABEL: define void @with_global_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @with_global_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(1) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(1) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { +; GFX9-LABEL: define void @without_region_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: store volatile i32 0, ptr addrspace(2) [[PTR]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @without_region_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: store volatile i32 0, ptr addrspace(2) [[PTR]], align 4 +; GFX10-NEXT: ret void +; + store volatile i32 0, ptr addrspace(2) %ptr + ret void +} + +define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: store volatile i32 0, ptr addrspace(2) [[PTR]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: store volatile i32 0, ptr addrspace(2) [[PTR]], align 4 +; GFX10-NEXT: ret void +; + store volatile i32 0, ptr addrspace(2) %ptr + ret void +} + +define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) { +; GFX9-LABEL: define void @with_region_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @with_region_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(2) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(2) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(2) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define void @without_group_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: store volatile i32 0, ptr addrspace(3) [[PTR]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @without_group_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: store volatile i32 0, ptr addrspace(3) [[PTR]], align 4 +; GFX10-NEXT: ret void +; + store volatile i32 0, ptr addrspace(3) %ptr + ret void +} + +define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: store volatile i32 0, ptr addrspace(3) [[PTR]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: store volatile i32 0, ptr addrspace(3) [[PTR]], align 4 +; GFX10-NEXT: ret void +; + store volatile i32 0, ptr addrspace(3) %ptr + ret void +} + +define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define void @with_group_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @with_group_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(3) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(3) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { +; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: store volatile i32 0, ptr addrspace(4) [[PTR]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: store volatile i32 0, ptr addrspace(4) [[PTR]], align 4 +; GFX10-NEXT: ret void +; + store volatile i32 0, ptr addrspace(4) %ptr + ret void +} + +define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: store volatile i32 0, ptr addrspace(4) [[PTR]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: store volatile i32 0, ptr addrspace(4) [[PTR]], align 4 +; GFX10-NEXT: ret void +; + store volatile i32 0, ptr addrspace(4) %ptr + ret void +} + +define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) { +; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(4) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(4) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(4) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @without_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: store volatile i32 0, ptr addrspace(5) [[PTR]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @without_private_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: store volatile i32 0, ptr addrspace(5) [[PTR]], align 4 +; GFX10-NEXT: ret void +; + store volatile i32 0, ptr addrspace(5) %ptr + ret void +} + +define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: store volatile i32 0, ptr addrspace(5) [[PTR]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: store volatile i32 0, ptr addrspace(5) [[PTR]], align 4 +; GFX10-NEXT: ret void +; + store volatile i32 0, ptr addrspace(5) %ptr + ret void +} + +define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @with_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @with_private_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX9-NEXT: call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: call void @without_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel( +; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX9-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel( +; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr +; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; GFX10-NEXT: call void @with_private_to_flat_addrspacecast(ptr addrspace(5) [[PTR]]) +; GFX10-NEXT: ret void +; + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +;; tests of addrspacecast in a constant + +define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) { +; GFX9-LABEL: define amdgpu_kernel void @private_constant_expression_use( +; GFX9-SAME: ptr addrspace(1) nocapture [[OUT:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) [[OUT]], align 8 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @private_constant_expression_use( +; GFX10-SAME: ptr addrspace(1) nocapture [[OUT:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) [[OUT]], align 8 +; GFX10-NEXT: ret void +; + store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8 + ret void +} + +;; tests of indirect call, intrinsics, inline asm + +@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4 + +define void @with_indirect_call() { +; GFX9-LABEL: define void @with_indirect_call( +; GFX9-SAME: ) #[[ATTR2:[0-9]+]] { +; GFX9-NEXT: [[FPTR:%.*]] = load ptr, ptr addrspace(4) @gv.fptr0, align 8 +; GFX9-NEXT: call void [[FPTR]]() +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @with_indirect_call( +; GFX10-SAME: ) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[FPTR:%.*]] = load ptr, ptr addrspace(4) @gv.fptr0, align 8 +; GFX10-NEXT: call void [[FPTR]]() +; GFX10-NEXT: ret void +; + %fptr = load ptr, ptr addrspace(4) @gv.fptr0 + call void %fptr() + ret void +} + +define amdgpu_kernel void @with_indirect_call_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel( +; GFX9-SAME: ) #[[ATTR2]] { +; GFX9-NEXT: [[FPTR:%.*]] = load ptr, ptr addrspace(4) @gv.fptr0, align 8 +; GFX9-NEXT: call void [[FPTR]]() +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel( +; GFX10-SAME: ) #[[ATTR2]] { +; GFX10-NEXT: [[FPTR:%.*]] = load ptr, ptr addrspace(4) @gv.fptr0, align 8 +; GFX10-NEXT: call void [[FPTR]]() +; GFX10-NEXT: ret void +; + %fptr = load ptr, ptr addrspace(4) @gv.fptr0 + call void %fptr() + ret void +} + +define void @call_with_indirect_call() { +; GFX9-LABEL: define void @call_with_indirect_call( +; GFX9-SAME: ) #[[ATTR2]] { +; GFX9-NEXT: call void @with_indirect_call() +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @call_with_indirect_call( +; GFX10-SAME: ) #[[ATTR2]] { +; GFX10-NEXT: call void @with_indirect_call() +; GFX10-NEXT: ret void +; + call void @with_indirect_call() + ret void +} + +define amdgpu_kernel void @call_with_indirect_call_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel( +; GFX9-SAME: ) #[[ATTR2]] { +; GFX9-NEXT: call void @with_indirect_call() +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel( +; GFX10-SAME: ) #[[ATTR2]] { +; GFX10-NEXT: call void @with_indirect_call() +; GFX10-NEXT: ret void +; + call void @with_indirect_call() + ret void +} + +define void @empty() { +; GFX9-LABEL: define void @empty( +; GFX9-SAME: ) #[[ATTR0]] { +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @empty( +; GFX10-SAME: ) #[[ATTR0]] { +; GFX10-NEXT: ret void +; + ret void +} + +define void @also_empty() { +; GFX9-LABEL: define void @also_empty( +; GFX9-SAME: ) #[[ATTR0]] { +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @also_empty( +; GFX10-SAME: ) #[[ATTR0]] { +; GFX10-NEXT: ret void +; + ret void +} + +define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) { +; GFX9-LABEL: define amdgpu_kernel void @indirect_call_known_callees( +; GFX9-SAME: i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; GFX9-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty +; GFX9-NEXT: [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty +; GFX9-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] +; GFX9: [[BB2]]: +; GFX9-NEXT: call void @also_empty() +; GFX9-NEXT: br label %[[BB6:.*]] +; GFX9: [[BB3]]: +; GFX9-NEXT: br i1 true, label %[[BB4:.*]], label %[[BB5:.*]] +; GFX9: [[BB4]]: +; GFX9-NEXT: call void @empty() +; GFX9-NEXT: br label %[[BB6]] +; GFX9: [[BB5]]: +; GFX9-NEXT: unreachable +; GFX9: [[BB6]]: +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @indirect_call_known_callees( +; GFX10-SAME: i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; GFX10-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty +; GFX10-NEXT: [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty +; GFX10-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]] +; GFX10: [[BB2]]: +; GFX10-NEXT: call void @also_empty() +; GFX10-NEXT: br label %[[BB6:.*]] +; GFX10: [[BB3]]: +; GFX10-NEXT: br i1 true, label %[[BB4:.*]], label %[[BB5:.*]] +; GFX10: [[BB4]]: +; GFX10-NEXT: call void @empty() +; GFX10-NEXT: br label %[[BB6]] +; GFX10: [[BB5]]: +; GFX10-NEXT: unreachable +; GFX10: [[BB6]]: +; GFX10-NEXT: ret void +; + %fptr = select i1 %cond, ptr @empty, ptr @also_empty + call void %fptr() + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() + +define void @use_intrinsic_workitem_id_x() { +; GFX9-LABEL: define void @use_intrinsic_workitem_id_x( +; GFX9-SAME: ) #[[ATTR5:[0-9]+]] { +; GFX9-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX9-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) null, align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @use_intrinsic_workitem_id_x( +; GFX10-SAME: ) #[[ATTR5:[0-9]+]] { +; GFX10-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX10-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) null, align 4 +; GFX10-NEXT: ret void +; + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, ptr addrspace(1) null + ret void +} + +define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel( +; GFX9-SAME: ) #[[ATTR0]] { +; GFX9-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX9-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) null, align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel( +; GFX10-SAME: ) #[[ATTR0]] { +; GFX10-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; GFX10-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) null, align 4 +; GFX10-NEXT: ret void +; + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, ptr addrspace(1) null + ret void +} + +define void @call_use_intrinsic_workitem_id_x() { +; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x( +; GFX9-SAME: ) #[[ATTR5]] { +; GFX9-NEXT: call void @use_intrinsic_workitem_id_x() +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x( +; GFX10-SAME: ) #[[ATTR5]] { +; GFX10-NEXT: call void @use_intrinsic_workitem_id_x() +; GFX10-NEXT: ret void +; + call void @use_intrinsic_workitem_id_x() + ret void +} + +define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { +; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel( +; GFX9-SAME: ) #[[ATTR5]] { +; GFX9-NEXT: call void @use_intrinsic_workitem_id_x() +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel( +; GFX10-SAME: ) #[[ATTR5]] { +; GFX10-NEXT: call void @use_intrinsic_workitem_id_x() +; GFX10-NEXT: ret void +; + call void @use_intrinsic_workitem_id_x() + ret void +} + +define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel( +; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) [[PTR]]) +; GFX9-NEXT: store volatile i32 7, ptr [[TMP1]], align 4 +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel( +; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) [[PTR]]) +; GFX10-NEXT: store volatile i32 7, ptr [[TMP1]], align 4 +; GFX10-NEXT: ret void +; + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} + +define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) { +; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel( +; GFX9-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { +; GFX9-NEXT: call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) [[PTR]]) +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel( +; GFX10-SAME: ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) [[PTR]]) +; GFX10-NEXT: ret void +; + call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) + ret void +} + +define amdgpu_kernel void @with_inline_asm() { +; GFX9-LABEL: define amdgpu_kernel void @with_inline_asm( +; GFX9-SAME: ) #[[ATTR3]] { +; GFX9-NEXT: call void asm sideeffect " +; GFX9-NEXT: ret void +; +; GFX10-LABEL: define amdgpu_kernel void @with_inline_asm( +; GFX10-SAME: ) #[[ATTR3]] { +; GFX10-NEXT: call void asm sideeffect " +; GFX10-NEXT: ret void +; + call void asm sideeffect "; use $0", "a"(i32 poison) + ret void +} + +;. +; GFX9: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX9: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx900" } +; GFX9: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +;. +; GFX10: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +; GFX10: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" } +; GFX10: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll index 2b9f579e6a18..8481cea4d7c3 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll @@ -51,7 +51,7 @@ bb5: ; preds = %bb5, %bb3 define amdgpu_kernel void @entry() { ; CHECK-LABEL: define {{[^@]+}}@entry -; CHECK-SAME: () #[[ATTR0]] { +; CHECK-SAME: () #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [[TMP0:%.*]], align 8, addrspace(5) ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr ; CHECK-NEXT: [[ARST:%.*]] = call double @baz(ptr [[CAST]]) @@ -63,5 +63,6 @@ define amdgpu_kernel void @entry() { ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/av-spill-expansion-with-machine-cp.mir b/llvm/test/CodeGen/AMDGPU/av-spill-expansion-with-machine-cp.mir index 460f6d24b9b1..dfe4b8a33f39 100644 --- a/llvm/test/CodeGen/AMDGPU/av-spill-expansion-with-machine-cp.mir +++ b/llvm/test/CodeGen/AMDGPU/av-spill-expansion-with-machine-cp.mir @@ -3,7 +3,6 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - -run-pass prologepilog,machine-cp -verify-machineinstrs | FileCheck -check-prefix=GFX908-PEI-MACHINECP %s # When VGPRs are available for spilling, prologepilog marks the tuple implicit-def as well as implicit in the first spill instruction. -# As a consequence, machine-cp would NOT delete agpr2 copy here. --- name: agpr-spill-to-vgpr-machine-cp @@ -11,6 +10,7 @@ tracksRegLiveness: true stack: - { id: 0, name: '', type: spill-slot, offset: 0, size: 128, alignment: 4 } machineFunctionInfo: + isEntryFunction: true scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: '$sgpr32' hasSpilledVGPRs: true @@ -43,8 +43,8 @@ body: | S_ENDPGM 0 ... -# When VGPRs are NOT available for spilling (stack is used), prologepilog marks the tuple implicit-def only and NOT implicit. -# As a consequence, machine-cp would delete agpr2 copy here. Presently, this is incorrect behavior. +# When VGPRs are NOT available for spilling (stack is used), prologepilog should also mark the tuple implicit-def and implicit (similar to above usecase). +# As a consequence, machine-cp would not delete agpr2 copy here. --- name: agpr-spill-to-vgpr-to-stack-machine-cp @@ -52,6 +52,7 @@ tracksRegLiveness: true stack: - { id: 0, name: '', type: spill-slot, offset: 0, size: 128, alignment: 4 } machineFunctionInfo: + isEntryFunction: true scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: '$sgpr32' hasSpilledVGPRs: true @@ -60,29 +61,34 @@ body: | successors: liveins: $vgpr0, $vgpr1 ; GFX908-PEI-LABEL: name: agpr-spill-to-vgpr-to-stack-machine-cp - ; GFX908-PEI: liveins: $vgpr0, $vgpr1, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55 + ; GFX908-PEI: liveins: $vgpr0, $vgpr1, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-PEI-NEXT: {{ $}} + ; GFX908-PEI-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-PEI-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-PEI-NEXT: renamable $agpr0 = COPY renamable $vgpr0, implicit $exec ; GFX908-PEI-NEXT: renamable $agpr2 = COPY renamable $vgpr1, implicit $exec ; GFX908-PEI-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = IMPLICIT_DEF ; GFX908-PEI-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = IMPLICIT_DEF - ; GFX908-PEI-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908-PEI-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5) + ; GFX908-PEI-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 + ; GFX908-PEI-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5) ; GFX908-PEI-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec - ; GFX908-PEI-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) + ; GFX908-PEI-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) ; GFX908-PEI-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2 ; GFX908-PEI-NEXT: S_ENDPGM 0 ; ; GFX908-PEI-MACHINECP-LABEL: name: agpr-spill-to-vgpr-to-stack-machine-cp - ; GFX908-PEI-MACHINECP: liveins: $vgpr0, $vgpr1, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55 + ; GFX908-PEI-MACHINECP: liveins: $vgpr0, $vgpr1, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr48, $vgpr49, $vgpr50, $vgpr51, $vgpr52, $vgpr53, $vgpr54, $vgpr55, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-PEI-MACHINECP-NEXT: {{ $}} + ; GFX908-PEI-MACHINECP-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-PEI-MACHINECP-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-PEI-MACHINECP-NEXT: renamable $agpr0 = COPY renamable $vgpr0, implicit $exec + ; GFX908-PEI-MACHINECP-NEXT: renamable $agpr2 = COPY renamable $vgpr1, implicit $exec ; GFX908-PEI-MACHINECP-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = IMPLICIT_DEF ; GFX908-PEI-MACHINECP-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = IMPLICIT_DEF - ; GFX908-PEI-MACHINECP-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908-PEI-MACHINECP-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5) + ; GFX908-PEI-MACHINECP-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 + ; GFX908-PEI-MACHINECP-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5) ; GFX908-PEI-MACHINECP-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec - ; GFX908-PEI-MACHINECP-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) + ; GFX908-PEI-MACHINECP-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) ; GFX908-PEI-MACHINECP-NEXT: $vgpr55 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2 ; GFX908-PEI-MACHINECP-NEXT: S_ENDPGM 0 renamable $agpr0 = COPY renamable $vgpr0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index 135efceb31fd..59e47cbc12b2 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN,GFX-940 %s -; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN,GFX-940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s ; TODO: Add global-isel when it can support bf16 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 9a647f04d43d..bc359d6ff3aa 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -25054,26 +25054,26 @@ define bfloat @v_log_bf16(bfloat %a) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x800000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GCN-NEXT: s_mov_b32 s5, 0x7f800000 -; GCN-NEXT: v_mov_b32_e32 v2, 0x41b17218 +; GCN-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; GCN-NEXT: v_sub_f32_e32 v3, v0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 +; GCN-NEXT: v_sub_f32_e32 v3, v0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 ; GCN-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3 ; GCN-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 ; GCN-NEXT: v_add_f32_e32 v3, v4, v3 ; GCN-NEXT: v_add_f32_e32 v3, v5, v3 -; GCN-NEXT: v_add_f32_e32 v1, v1, v3 +; GCN-NEXT: v_add_f32_e32 v2, v2, v3 ; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -25084,10 +25084,10 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 -; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_log_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -25109,10 +25109,10 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -25143,10 +25143,10 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -25174,8 +25174,9 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX10-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 @@ -25199,28 +25200,30 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX11-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 ; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo ; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log.bf16(bfloat %a) @@ -25233,14 +25236,14 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x800000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GCN-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GCN-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -25251,10 +25254,10 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 -; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_log_f32_e32 v0, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -25267,10 +25270,10 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -25290,9 +25293,9 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x800000 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -25311,9 +25314,10 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -25330,20 +25334,21 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log2.bf16(bfloat %a) @@ -25356,26 +25361,26 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x800000 -; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GCN-NEXT: s_mov_b32 s5, 0x7f800000 -; GCN-NEXT: v_mov_b32_e32 v2, 0x411a209b +; GCN-NEXT: v_mov_b32_e32 v1, 0x411a209b ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; GCN-NEXT: v_sub_f32_e32 v3, v0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 +; GCN-NEXT: v_sub_f32_e32 v3, v0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 ; GCN-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3 ; GCN-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 ; GCN-NEXT: v_add_f32_e32 v3, v4, v3 ; GCN-NEXT: v_add_f32_e32 v3, v5, v3 -; GCN-NEXT: v_add_f32_e32 v1, v1, v3 +; GCN-NEXT: v_add_f32_e32 v2, v2, v3 ; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -25386,10 +25391,10 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 -; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_log_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX7-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -25411,10 +25416,10 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -25445,10 +25450,10 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX9-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -25476,8 +25481,9 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX10-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 @@ -25501,28 +25507,30 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX11-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 ; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo ; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.log10.bf16(bfloat %a) @@ -25719,14 +25727,14 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x42800000 -; GCN-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; GCN-NEXT: v_not_b32_e32 v2, 63 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -25741,9 +25749,9 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_exp_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_not_b32_e32 v1, 63 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -25757,9 +25765,9 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 @@ -25779,10 +25787,10 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -25797,10 +25805,10 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -25816,12 +25824,12 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll index 2b4bea186f78..de2e25651271 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s ; Test that materialization constants that are the bit reversed of ; inline immediates are replaced with bfrev of the inline immediate to diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 1d984bd49756..ff47c865c67e 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -297,10 +297,7 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_eq_f32 s0, 0 -; GFX12-NEXT: s_cselect_b32 s1, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 vcc_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_vccz .LBB2_1 +; GFX12-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX12-NEXT: ; %bb.3: ; %bb0 ; GFX12-NEXT: s_getpc_b64 s[2:3] ; GFX12-NEXT: .Lpost_getpc2: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll index 84ea2beb8d04..384beae07ce2 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -13,122 +13,119 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg0, addrspace 6) ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s64) from %ir.arg0, align 16, addrspace 6) - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 8 - ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], killed [[S_MOV_B32_1]], implicit-def dead $scc - ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE killed [[S_ADD_I32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 - ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[REG_SEQUENCE1]], 0, 0 :: (dereferenceable invariant load (s64) from %ir.arg0 + 8, basealign 16, addrspace 6) - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 - ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[COPY5]], %subreg.sub2, killed [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[REG_SEQUENCE]], 8, 0 :: (dereferenceable invariant load (s64) from %ir.arg0 + 8, basealign 16, addrspace 6) + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[COPY3]], %subreg.sub2, killed [[COPY2]], %subreg.sub3 ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 80 - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 160 - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY10]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_6]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[COPY11]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[COPY11]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 88 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 176 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[COPY12]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[COPY12]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[S_MOV_B32_8]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[S_MOV_B32_9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE3]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE1]], [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET6]], [[REG_SEQUENCE1]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE2]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE1]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE1]], [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6]], [[REG_SEQUENCE1]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE2]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE1]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 112 + ; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 ; GCN-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 224 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 ; GCN-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY22:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) @@ -137,27 +134,27 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY24]], [[REG_SEQUENCE2]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY24]], [[REG_SEQUENCE1]], [[S_MOV_B32_13]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY25]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY25]], [[REG_SEQUENCE1]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[REG_SEQUENCE2]], [[COPY27]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[REG_SEQUENCE1]], [[COPY27]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 120 + ; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 ; GCN-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 240 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 ; GCN-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) @@ -165,26 +162,26 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[REG_SEQUENCE2]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[REG_SEQUENCE1]], [[S_MOV_B32_15]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY35]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY35]], [[REG_SEQUENCE1]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY37:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY36]], [[REG_SEQUENCE2]], [[COPY37]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY36]], [[REG_SEQUENCE1]], [[COPY37]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY38]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 256 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 ; GCN-NEXT: [[COPY40:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY40]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_18]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_18]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY40]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY42:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY41]], [[S_LOAD_DWORDX4_IMM]], [[COPY42]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) @@ -192,54 +189,54 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY43]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY43]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY44]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY44]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY45]], [[REG_SEQUENCE2]], [[S_MOV_B32_18]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_18]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY45]], [[REG_SEQUENCE1]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY47:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY46]], [[REG_SEQUENCE2]], [[COPY47]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY46]], [[REG_SEQUENCE1]], [[COPY47]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY48:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY48]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 136 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY48]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 ; GCN-NEXT: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 272 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 ; GCN-NEXT: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY52:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[COPY52]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[COPY52]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY53:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY53]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY53]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY54:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY54]], [[REG_SEQUENCE2]], [[S_MOV_B32_19]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY54]], [[REG_SEQUENCE1]], [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY55:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY55]], [[REG_SEQUENCE2]], [[S_MOV_B32_20]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_20]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY55]], [[REG_SEQUENCE1]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY56:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY57:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY56]], [[REG_SEQUENCE2]], [[COPY57]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY56]], [[REG_SEQUENCE1]], [[COPY57]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY58:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[COPY58]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 288 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288 ; GCN-NEXT: [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY61:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY62:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) @@ -247,44 +244,44 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY63:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[COPY63]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[COPY63]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY64:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY64]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY64]], [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY65:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN8]], [[COPY65]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN1]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN8]], [[COPY65]], [[REG_SEQUENCE1]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN1]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY66:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY67:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN9]], [[COPY66]], [[REG_SEQUENCE2]], [[COPY67]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN9]], [[COPY66]], [[REG_SEQUENCE1]], [[COPY67]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY68:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 152 + ; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 ; GCN-NEXT: [[COPY69:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sreg_32 = S_MOV_B32 304 + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 ; GCN-NEXT: [[COPY70:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY71:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY72:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY71]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY73:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[COPY73]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[COPY73]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY74:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY74]], [[REG_SEQUENCE2]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY74]], [[REG_SEQUENCE1]], [[S_MOV_B32_21]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY75:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8]], [[COPY75]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8]], [[COPY75]], [[REG_SEQUENCE1]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY76:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY77:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9]], [[COPY76]], [[REG_SEQUENCE2]], [[COPY77]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9]], [[COPY76]], [[REG_SEQUENCE1]], [[COPY77]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: S_ENDPGM 0 bb.0: %tmp0 = load <4 x i32>, ptr addrspace(6) %arg0, align 16, !invariant.load !0 diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 5bbea7ecf3f2..5dde193528aa 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -1021,8 +1021,116 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) { ; GFX9-LABEL: sdiv64_known32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GFX9-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 +; GFX9-NEXT: v_sub_co_u32_e32 v11, vcc, 0, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v0, vcc +; GFX9-NEXT: v_madmk_f32 v1, v3, 0x4f800000, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_madmk_f32 v1, v3, 0xcf800000, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_lo_u32 v5, v11, v10 +; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 +; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 +; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v9, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v8, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v13, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v10, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, v11, v13 +; GFX9-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0 +; GFX9-NEXT: v_add3_u32 v8, v4, v5, v8 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, 0 +; GFX9-NEXT: v_mul_hi_u32 v12, v1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v3, 0 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v12, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v11, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v5, 0 +; GFX9-NEXT: v_mul_hi_u32 v8, v7, v1 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v8, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v1, 0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v10, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v0, v1 +; GFX9-NEXT: v_mul_lo_u32 v9, v6, v5 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v1, 0 +; GFX9-NEXT: v_add3_u32 v4, v4, v9, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, v2, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v7, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v8, v0, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v3, v6 +; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 +; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: .LBB10_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] +; GFX9-NEXT: s_cbranch_execz .LBB10_4 +; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, 0, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1033,14 +1141,17 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_lo_u32 v2, v0, v3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX9-NEXT: .LBB10_4: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.ext = ashr i64 %a, 32 %b.ext = ashr i64 %b, 32 diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll index 1b35a89ad7f9..4011c21af690 100644 --- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -139,10 +139,6 @@ define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) { ; GISEL-LABEL: csh_v4i32: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v4, 31, v4 -; GISEL-NEXT: v_and_b32_e32 v5, 31, v5 -; GISEL-NEXT: v_and_b32_e32 v6, 31, v6 -; GISEL-NEXT: v_and_b32_e32 v7, 31, v7 ; GISEL-NEXT: v_lshlrev_b32_e32 v8, v4, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v9, v5, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v10, v6, v2 diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index b2e4117096ce..fdc9704a3784 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -337,9 +337,9 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, 0x800000 ; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e64 v3, |v0|, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GFX9-NEXT: v_ldexp_f32 v3, |v0|, v3 ; GFX9-NEXT: v_log_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -353,10 +353,10 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) { ; GFX9-NEXT: v_fma_f32 v2, v2, v1, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX9-NEXT: v_exp_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 1.0, v3, vcc +; GFX9-NEXT: v_not_b32_e32 v3, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX9-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index cce0fb7e003c..5b72795ba07e 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -270,40 +270,36 @@ define float @fmul_select_f32_test7_sel_log2val_pos59_pos92(float %x, i32 %bool. ; GFX7-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, 0x6d800000 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x5d000000 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x5c ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, 59, vcc +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x6d800000 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x5d000000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x5c ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 59, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x5d000000 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x6d800000, v3, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo +; GFX1030-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0x5d000000 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x6d800000, v3, vcc_lo -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float 0x43A0000000000000, float 0x45B0000000000000 @@ -440,40 +436,36 @@ define float @fmul_select_f32_test11_sel_log2val_pos78_pos56(float %x, i32 %bool ; GFX7-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, 0xdb800000 -; GFX7-NEXT: v_mov_b32_e32 v4, 0xe6800000 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x4e ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 56, v3, vcc +; GFX7-NEXT: v_ldexp_f32_e64 v0, -v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xdb800000 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xe6800000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4e ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 56, v3, vcc +; GFX9-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0xe6800000 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xdb800000, v3, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1030-NEXT: v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo +; GFX1030-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0xe6800000 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xdb800000, v3, vcc_lo -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float 0xC4D0000000000000, float 0xC370000000000000 @@ -485,40 +477,40 @@ define float @fmul_select_f32_test12_sel_log2val_neg48_pos68(float %x, i32 %bool ; GFX7-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, 0x61800000 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x27800000 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x44 +; GFX7-NEXT: v_not_b32_e32 v4, 47 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x61800000 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x27800000 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x44 +; GFX9-NEXT: v_not_b32_e32 v4, 47 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x27800000 +; GFX1030-NEXT: v_not_b32_e32 v3, 47 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x61800000, v3, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo +; GFX1030-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0x27800000 +; GFX1100-NEXT: v_not_b32_e32 v3, 47 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x61800000, v3, vcc_lo -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo +; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float 0x3CF0000000000000, float 0x4430000000000000 @@ -530,40 +522,34 @@ define double @fmul_select_f64_test1(double %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_f64_test1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, 0x3ff00000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, 2.0, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f64_test1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3ff00000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, 2.0, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f64_test1: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0x3ff00000, 2.0, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f64_test1: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: v_mov_b32_e32 v4, 0 -; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0x3ff00000, 2.0, vcc_lo +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 2.000000e+00, double 1.000000e+00 @@ -575,43 +561,34 @@ define double @fmul_select_f64_test2(double %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_f64_test2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, 0x3ff00000 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x3fe00000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX7-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f64_test2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3ff00000 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3fe00000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f64_test2: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x3fe00000 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: v_cndmask_b32_e32 v5, 0x3ff00000, v5, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo +; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f64_test2: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v5, 0x3fe00000 :: v_dual_mov_b32 v4, 0 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v5, 0x3ff00000, v5, vcc_lo -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 5.000000e-01, double 1.000000e+00 @@ -623,57 +600,46 @@ define <2 x double> @fmul_select_v2f64_test3(<2 x double> %x, <2 x i32> %bool.ar ; GFX7-LABEL: fmul_select_v2f64_test3: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, 0x3ff00000 -; GFX7-NEXT: v_mov_b32_e32 v8, 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX7-NEXT: v_cndmask_b32_e64 v10, v11, 2.0, vcc -; GFX7-NEXT: v_mov_b32_e32 v9, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[9:10] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v11, 2.0, vcc -; GFX7-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX7-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX7-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_v2f64_test3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, 0x3ff00000 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v11, 2.0, vcc -; GFX9-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[9:10] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, 2.0, vcc -; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_v2f64_test3: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0 -; GFX1030-NEXT: v_cndmask_b32_e64 v11, 0x3ff00000, 2.0, vcc_lo +; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1030-NEXT: v_mov_b32_e32 v10, v8 -; GFX1030-NEXT: v_cndmask_b32_e64 v9, 0x3ff00000, 2.0, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[10:11] -; GFX1030-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX1030-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_v2f64_test3: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1100-NEXT: v_mov_b32_e32 v8, 0 -; GFX1100-NEXT: v_cndmask_b32_e64 v11, 0x3ff00000, 2.0, vcc_lo +; GFX1100-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_mov_b32_e32 v10, v8 -; GFX1100-NEXT: v_cndmask_b32_e64 v9, 0x3ff00000, 2.0, vcc_lo -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[10:11] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1100-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX1100-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00> @@ -685,59 +651,46 @@ define <2 x double> @fmul_select_v2f64_test4(<2 x double> %x, <2 x i32> %bool.ar ; GFX7-LABEL: fmul_select_v2f64_test4: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, 0x3ff00000 -; GFX7-NEXT: v_mov_b32_e32 v12, 0x3fe00000 -; GFX7-NEXT: v_mov_b32_e32 v8, 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; GFX7-NEXT: v_mov_b32_e32 v9, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[9:10] -; GFX7-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; GFX7-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX7-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX7-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_v2f64_test4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, 0x3ff00000 -; GFX9-NEXT: v_mov_b32_e32 v12, 0x3fe00000 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; GFX9-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[9:10] -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_v2f64_test4: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v9, 0x3fe00000 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0 -; GFX1030-NEXT: v_cndmask_b32_e32 v11, 0x3ff00000, v9, vcc_lo +; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1030-NEXT: v_mov_b32_e32 v10, v8 -; GFX1030-NEXT: v_cndmask_b32_e32 v9, 0x3ff00000, v9, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[10:11] -; GFX1030-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX1030-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_v2f64_test4: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v9, 0x3fe00000 :: v_dual_mov_b32 v8, 0 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_dual_mov_b32 v10, v8 :: v_dual_cndmask_b32 v11, 0x3ff00000, v9 +; GFX1100-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[10:11] -; GFX1100-NEXT: v_cndmask_b32_e32 v9, 0x3ff00000, v9, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX1100-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 5.000000e-01>, <2 x double> <double 1.000000e+00, double 1.000000e+00> @@ -749,43 +702,34 @@ define double @fmul_select_f64_test5(double %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_f64_test5: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, 0xbff00000 -; GFX7-NEXT: v_mov_b32_e32 v5, 0xbfe00000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX7-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f64_test5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0xbff00000 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xbfe00000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f64_test5: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v5, 0xbfe00000 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: v_cndmask_b32_e32 v5, 0xbff00000, v5, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo +; GFX1030-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f64_test5: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v5, 0xbfe00000 :: v_dual_mov_b32 v4, 0 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v5, 0xbff00000, v5, vcc_lo -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double -5.000000e-01, double -1.000000e+00 @@ -797,40 +741,34 @@ define double @fmul_select_f64_test6(double %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_f64_test6: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, 0xbff00000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, -2.0, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f64_test6: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0xbff00000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, -2.0, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f64_test6: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0xbff00000, -2.0, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX1030-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f64_test6: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: v_mov_b32_e32 v4, 0 -; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0xbff00000, -2.0, vcc_lo +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double -2.000000e+00, double -1.000000e+00 @@ -887,43 +825,34 @@ define double @fmul_select_f64_test8(double %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_f64_test8: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, 0xc0400000 -; GFX7-NEXT: v_mov_b32_e32 v5, 0xc0100000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc +; GFX7-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f64_test8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0xc0400000 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xc0100000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc +; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f64_test8: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v5, 0xc0100000 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: v_cndmask_b32_e32 v5, 0xc0400000, v5, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1030-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc_lo +; GFX1030-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f64_test8: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v5, 0xc0100000 :: v_dual_mov_b32 v4, 0 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v5, 0xc0400000, v5, vcc_lo -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double -4.000000e+00, double -3.200000e+01 @@ -935,57 +864,46 @@ define <2 x double> @fmul_select_v2f64_test9(<2 x double> %x, <2 x i32> %bool.ar ; GFX7-LABEL: fmul_select_v2f64_test9: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, 0xbff00000 -; GFX7-NEXT: v_mov_b32_e32 v8, 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX7-NEXT: v_cndmask_b32_e64 v10, v11, -2.0, vcc -; GFX7-NEXT: v_mov_b32_e32 v9, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[9:10] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v11, -2.0, vcc -; GFX7-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX7-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4 +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX7-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_v2f64_test9: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, 0xbff00000 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v11, -2.0, vcc -; GFX9-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[9:10] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, -2.0, vcc -; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_v2f64_test9: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0 -; GFX1030-NEXT: v_cndmask_b32_e64 v11, 0xbff00000, -2.0, vcc_lo +; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1030-NEXT: v_mov_b32_e32 v10, v8 -; GFX1030-NEXT: v_cndmask_b32_e64 v9, 0xbff00000, -2.0, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[10:11] -; GFX1030-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX1030-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4 +; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX1030-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v5 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_v2f64_test9: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1100-NEXT: v_mov_b32_e32 v8, 0 -; GFX1100-NEXT: v_cndmask_b32_e64 v11, 0xbff00000, -2.0, vcc_lo +; GFX1100-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_mov_b32_e32 v10, v8 -; GFX1100-NEXT: v_cndmask_b32_e64 v9, 0xbff00000, -2.0, vcc_lo -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[10:11] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1100-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4 +; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX1100-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v5 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x double> <double -2.000000e+00, double -2.000000e+00>, <2 x double> <double -1.000000e+00, double -1.000000e+00> @@ -997,61 +915,56 @@ define <2 x double> @fmul_select_v2f64_test10(<2 x double> %x, <2 x i32> %bool.a ; GFX7-LABEL: fmul_select_v2f64_test10: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, 0 -; GFX7-NEXT: v_mov_b32_e32 v9, 0xbff00000 -; GFX7-NEXT: v_mov_b32_e32 v10, 0x3fe00000 +; GFX7-NEXT: v_mov_b32_e32 v8, 0xbff00000 +; GFX7-NEXT: v_mov_b32_e32 v9, 0x3fe00000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX7-NEXT: v_mov_b32_e32 v11, 0x3ff00000 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v9, v10, vcc -; GFX7-NEXT: v_mov_b32_e32 v9, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[9:10] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v11, 2.0, vcc -; GFX7-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v8, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX7-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_v2f64_test10: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xbff00000 -; GFX9-NEXT: v_mov_b32_e32 v10, 0x3fe00000 +; GFX9-NEXT: v_mov_b32_e32 v8, 0xbff00000 +; GFX9-NEXT: v_mov_b32_e32 v9, 0x3fe00000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX9-NEXT: v_mov_b32_e32 v11, 0x3ff00000 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v9, v10, vcc -; GFX9-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[9:10] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, 2.0, vcc -; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_v2f64_test10: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v9, 0x3fe00000 +; GFX1030-NEXT: v_mov_b32_e32 v8, 0x3fe00000 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0 -; GFX1030-NEXT: v_cndmask_b32_e32 v11, 0xbff00000, v9, vcc_lo +; GFX1030-NEXT: v_cndmask_b32_e32 v9, 0xbff00000, v8, vcc_lo ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1030-NEXT: v_mov_b32_e32 v10, v8 -; GFX1030-NEXT: v_cndmask_b32_e64 v9, 0x3ff00000, 2.0, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[10:11] -; GFX1030-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX1030-NEXT: v_mov_b32_e32 v8, 0 +; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX1030-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_v2f64_test10: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v9, 0x3fe00000 :: v_dual_mov_b32 v8, 0 +; GFX1100-NEXT: v_mov_b32_e32 v8, 0x3fe00000 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_dual_mov_b32 v10, v8 :: v_dual_cndmask_b32 v11, 0xbff00000, v9 +; GFX1100-NEXT: v_dual_cndmask_b32 v9, 0xbff00000, v8 :: v_dual_mov_b32 v8, 0 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[10:11] -; GFX1100-NEXT: v_cndmask_b32_e64 v9, 0x3ff00000, 2.0, vcc_lo +; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX1100-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f64 v[2:3], v[2:3], v[8:9] +; GFX1100-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 2.000000e+00>, <2 x double> <double -1.000000e+00, double 1.000000e+00> @@ -1199,43 +1112,40 @@ define double @fmul_select_f64_test14_sel_log2val_pos92_neg27(double %x, i32 %bo ; GFX7-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, 0x3e400000 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x45b00000 +; GFX7-NEXT: v_not_b32_e32 v4, 26 +; GFX7-NEXT: v_mov_b32_e32 v5, 0x5c ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX7-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3e400000 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x45b00000 +; GFX9-NEXT: v_not_b32_e32 v4, 26 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x5c ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x45b00000 +; GFX1030-NEXT: v_mov_b32_e32 v4, 0x5c ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: v_cndmask_b32_e32 v5, 0x3e400000, v5, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo +; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v5, 0x45b00000 :: v_dual_mov_b32 v4, 0 +; GFX1100-NEXT: v_mov_b32_e32 v4, 0x5c ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v5, 0x3e400000, v5, vcc_lo -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo +; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 0x45B0000000000000, double 0x3E40000000000000 @@ -1247,43 +1157,40 @@ define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bo ; GFX7-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, 0x3de00000 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x3d500000 +; GFX7-NEXT: v_not_b32_e32 v4, 32 +; GFX7-NEXT: v_not_b32_e32 v5, 41 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX7-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3de00000 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3d500000 +; GFX9-NEXT: v_not_b32_e32 v4, 32 +; GFX9-NEXT: v_not_b32_e32 v5, 41 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x3d500000 +; GFX1030-NEXT: v_not_b32_e32 v4, 41 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: v_cndmask_b32_e32 v5, 0x3de00000, v5, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo +; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v5, 0x3d500000 :: v_dual_mov_b32 v4, 0 +; GFX1100-NEXT: v_not_b32_e32 v4, 41 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v5, 0x3de00000, v5, vcc_lo -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo +; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 0x3D50000000000000, double 0x3DE0000000000000 @@ -1298,38 +1205,34 @@ define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f16_test1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f16_test1: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo -; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f16_test1: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo -; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half 2.000000e+00, half 1.000000e+00 @@ -1343,38 +1246,41 @@ define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f16_test2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3800 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX9-NEXT: v_med3_i32 v1, v1, s4, v2 +; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f16_test2: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x3800 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo -; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1030-NEXT: s_movk_i32 s4, 0x8000 +; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX1030-NEXT: v_med3_i32 v1, v1, s4, 0x7fff +; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f16_test2: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0x3800 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo -; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1100-NEXT: s_movk_i32 s0, 0x8000 +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_med3_i32 v1, v1, s0, 0x7fff +; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half 5.000000e-01, half 1.000000e+00 @@ -1507,40 +1413,35 @@ define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, 2.0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f16_test5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4800 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc +; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f16_test5: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x4800, v3, vcc_lo -; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1030-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo +; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f16_test5: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x4800, v3, vcc_lo -; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half 2.000000e+00, half 8.000000e+00 @@ -1688,42 +1589,36 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-LABEL: fmul_select_f16_test9: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0xc2000000 -; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1800000 +; GFX7-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f16_test9: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xd000 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xcc00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc +; GFX9-NEXT: v_ldexp_f16_e64 v0, -v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f16_test9: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0xcc00 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xd000, v3, vcc_lo -; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1030-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc_lo +; GFX1030-NEXT: v_ldexp_f16_e64 v0, -v0, v1 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f16_test9: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0xcc00 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xd000, v3, vcc_lo -; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f16_e64 v0, -v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half -1.600000e+01, half -3.200000e+01 @@ -1736,41 +1631,42 @@ define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.a ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x45000000 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x3a000000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x6800 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x1000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX9-NEXT: v_med3_i32 v1, v1, s4, v2 +; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x1000 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x6800, v3, vcc_lo -; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1030-NEXT: s_movk_i32 s4, 0x8000 +; GFX1030-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo +; GFX1030-NEXT: v_med3_i32 v1, v1, s4, 0x7fff +; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0x1000 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x6800, v3, vcc_lo -; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1100-NEXT: s_movk_i32 s0, 0x8000 +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_med3_i32 v1, v1, s0, 0x7fff +; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half 0xH1000, half 0xH6800 @@ -1783,41 +1679,42 @@ define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.ar ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x38800000 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x43000000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x5800 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX9-NEXT: v_med3_i32 v1, v1, s4, v2 +; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX1030-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x5800 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x400, v3, vcc_lo -; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1030-NEXT: s_movk_i32 s4, 0x8000 +; GFX1030-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo +; GFX1030-NEXT: v_med3_i32 v1, v1, s4, 0x7fff +; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0x5800 ; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x400, v3, vcc_lo -; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1100-NEXT: s_movk_i32 s0, 0x8000 +; GFX1100-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_med3_i32 v1, v1, s0, 0x7fff +; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half 0xH5800, half 0xH0400 diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index aa182b720c60..510ee9c1a23f 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -27,7 +27,7 @@ define internal void @direct() { define amdgpu_kernel void @test_direct_indirect_call() { ; CHECK-LABEL: define {{[^@]+}}@test_direct_indirect_call -; CHECK-SAME: () #[[ATTR1]] { +; CHECK-SAME: () #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: call void @direct() ; CHECK-NEXT: ret void ; @@ -35,6 +35,7 @@ define amdgpu_kernel void @test_direct_indirect_call() { ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index b2f178c6c104..d9182d7ace8b 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1455,7 +1455,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0x7f ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr3_vgpr4 killed $exec ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 @@ -1544,77 +1544,77 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[10:11] -; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[8:9], v[10:11] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[10:11] +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[8:9], v[10:11] ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v4 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 ; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v7 -; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: s_mov_b32 s12, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s12 ; GFX9-G-O0-NEXT: v_add_u32_e64 v4, v4, v7 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v3, v3 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 ; GFX9-G-O0-NEXT: v_min_u32_e64 v3, v3, v6 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[10:11] ; GFX9-G-O0-NEXT: s_mov_b32 s16, 0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[6:7], v[8:9] +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[6:7], v[8:9] ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v1 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 ; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s12 ; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v4, v6 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v4, v2 ; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v5 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 ; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 ; GFX9-G-O0-NEXT: v_min_u32_e64 v4, v4, v7 -; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[10:11] ; GFX9-G-O0-NEXT: s_mov_b32 s15, 0 -; GFX9-G-O0-NEXT: s_mov_b32 s11, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s13, 0 ; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 -; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 -; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v3, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s12, 0 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[10:11], v3, v4 ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s16 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[8:9], v3, v4, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s15 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v4, v7, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[10:11], v4, v7, s[10:11] ; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v4, v7, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[10:11], v4, v7, s[10:11] ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[11:12], v[13:14] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[11:12], v[13:14] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s13 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[11:12], v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v4, v9, s[12:13] @@ -3688,20 +3688,20 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 ; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v4, v7, s[8:9] ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v7 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[11:12], v[13:14] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s5 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s4 -; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[11:12], v[13:14] -; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s13 -; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[11:12], v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v4, v9, s[12:13] diff --git a/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir new file mode 100644 index 000000000000..5c7c07632f0d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir @@ -0,0 +1,54 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=default -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=dummy -o - %s | FileCheck -check-prefixes=CHECK,DUMMY %s + +# Check that the regalloc-enable-priority-advisor=dummy option works +# and the result is different from the default. Ordinarily %1 would be +# prioritized higher than %0 due to the register class priority + +--- +name: foo +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vreg_128 } + - { id: 2, class: vgpr_32 } +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; DEFAULT-LABEL: name: foo + ; DEFAULT: liveins: $vgpr0, $vgpr1 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + ; DEFAULT-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; DEFAULT-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; DEFAULT-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; DEFAULT-NEXT: renamable $vgpr3 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; DEFAULT-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec + ; DEFAULT-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + ; + ; DUMMY-LABEL: name: foo + ; DUMMY: liveins: $vgpr0, $vgpr1 + ; DUMMY-NEXT: {{ $}} + ; DUMMY-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) + ; DUMMY-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; DUMMY-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; DUMMY-NEXT: renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; DUMMY-NEXT: renamable $vgpr3_vgpr4_vgpr5_vgpr6 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) + ; DUMMY-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr3, killed $vgpr2, implicit $exec + ; DUMMY-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + undef %1.sub0:vreg_128 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + %2:vgpr_32 = V_ADD_U32_e32 %1.sub0, %0, implicit $exec + $vgpr3 = COPY %2 + SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + +... + +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 848019c87292..c430c41f5914 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -42,6 +42,6 @@ attributes #0 = { "amdgpu-no-dispatch-id" } ;. ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. -; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index 1c093bf31ea7..73aa87e5c55d 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -5,8 +5,188 @@ target datalayout = "A5" ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca -define amdgpu_kernel void @test_dynamic_stackalloc(ptr addrspace(1) %out, i32 %n) { +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) { %alloca = alloca i32, i32 %n, addrspace(5) - store volatile i32 0, ptr addrspace(5) %alloca + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i32 %n) { + %alloca = alloca i32, i32 %n, align 128, addrspace(5) + store volatile i32 10, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(i32 %n) { + %alloca = alloca i32, i32 %n, align 2, addrspace(5) + store volatile i32 22, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca float, i32 %idx, addrspace(5) + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca i32, i32 %idx, align 128, addrspace(5) + store volatile i32 444, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligned() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca i128, i32 %idx, align 2, addrspace(5) + store volatile i32 666, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %n, i32 %m) { +entry: + %cond = icmp eq i32 %n, 0 + %alloca1 = alloca i32, i32 8, addrspace(5) + %alloca2 = alloca i17, i32 %n, addrspace(5) + br i1 %cond, label %bb.0, label %bb.1 +bb.0: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca3 = alloca i32, i32 %m, align 64, addrspace(5) + %alloca4 = alloca i32, i32 %idx, align 4, addrspace(5) + store volatile i32 3, ptr addrspace(5) %alloca3 + store volatile i32 4, ptr addrspace(5) %alloca4 + br label %bb.1 +bb.1: + store volatile i32 1, ptr addrspace(5) %alloca1 + store volatile i32 2, ptr addrspace(5) %alloca2 + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i32 %m) { +entry: + %cond = icmp eq i32 %n, 0 + br i1 %cond, label %bb.0, label %bb.1 +bb.0: + %alloca2 = alloca i32, i32 %m, align 64, addrspace(5) + store volatile i32 2, ptr addrspace(5) %alloca2 + br label %bb.2 +bb.1: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca1 = alloca i32, i32 %idx, align 4, addrspace(5) + store volatile i32 1, ptr addrspace(5) %alloca1 + br label %bb.2 +bb.2: + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_uniform(i32 %n) { + %alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { + %alloca = alloca i32, i32 %n, align 128, addrspace(5) + store volatile i32 10, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { + %alloca = alloca i32, i32 %n, align 2, addrspace(5) + store volatile i32 22, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_divergent() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca i32, i32 %idx, addrspace(5) + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_divergent_over_aligned() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca i32, i32 %idx, align 128, addrspace(5) + store volatile i32 444, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_divergent_under_aligned() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca i32, i32 %idx, align 2, addrspace(5) + store volatile i32 666, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { +entry: + %cond = icmp eq i32 %n, 0 + %alloca1 = alloca i32, i32 8, addrspace(5) + %alloca2 = alloca i32, i32 %n, addrspace(5) + br i1 %cond, label %bb.0, label %bb.1 +bb.0: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca3 = alloca i32, i32 %m, align 64, addrspace(5) + %alloca4 = alloca i32, i32 %idx, align 4, addrspace(5) + store volatile i32 3, ptr addrspace(5) %alloca3 + store volatile i32 4, ptr addrspace(5) %alloca4 + br label %bb.1 +bb.1: + store volatile i32 1, ptr addrspace(5) %alloca1 + store volatile i32 2, ptr addrspace(5) %alloca2 + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { +entry: + %cond = icmp eq i32 %n, 0 + br i1 %cond, label %bb.0, label %bb.1 +bb.0: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca1 = alloca i32, i32 %idx, align 4, addrspace(5) + store volatile i32 1, ptr addrspace(5) %alloca1 + br label %bb.2 +bb.1: + %alloca2 = alloca i32, i32 %m, align 64, addrspace(5) + store volatile i32 2, ptr addrspace(5) %alloca2 + br label %bb.2 +bb.2: ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll index 2140f50611d7..ebfb5e9ccaa3 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll @@ -1613,14 +1613,12 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) { ; CODEGEN-IEEE-SDAG: ; %bb.0: ; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CODEGEN-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; CODEGEN-IEEE-SDAG-NEXT: v_mov_b32_e32 v1, 0x4b800000 ; CODEGEN-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0 -; CODEGEN-IEEE-SDAG-NEXT: v_mov_b32_e32 v1, 0x45800000 -; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract: diff --git a/llvm/test/CodeGen/AMDGPU/fdot2.ll b/llvm/test/CodeGen/AMDGPU/fdot2.ll index 695042d44d87..776816d6aa0e 100644 --- a/llvm/test/CodeGen/AMDGPU/fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/fdot2.ll @@ -5,6 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED ; (fadd (fmul S1.x, S2.x), (fadd (fmul (S1.y, S2.y), z))) -> (fdot2 S1, S2, z) ; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions @@ -21,6 +22,7 @@ ; GFX906-CONTRACT: v_mac_f16_e32 ; GFX906-DENORM-CONTRACT: v_fma_f16 +; GFX906-DOT10-DISABLED: v_fma_f16 define amdgpu_kernel void @dotproduct_f16(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -44,8 +46,11 @@ entry: } -; We only want to generate fdot2 if vector element of dot product is converted from f16 to f32 -; and the vectors are of type <2 x half> +; We only want to generate fdot2 if: +; - vector element of dot product is converted from f16 to f32, and +; - the vectors are of type <2 x half>, and +; - "dot10-insts" is enabled + ; GCN-LABEL: {{^}}dotproduct_f16_f32 ; GFX900: v_mad_mix_f32 ; GFX900: v_mad_mix_f32 @@ -59,6 +64,7 @@ entry: ; GFX906-CONTRACT: v_dot2_f32_f16 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16 +; GFX906-DOT10-DISABLED: v_fma_mix_f32 define amdgpu_kernel void @dotproduct_f16_f32(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -85,8 +91,11 @@ entry: ret void } -; We only want to generate fdot2 if vector element of dot product is converted from f16 to f32 -; and the vectors are of type <2 x half> +; We only want to generate fdot2 if: +; - vector element of dot product is converted from f16 to f32, and +; - the vectors are of type <2 x half>, and +; - "dot10-insts" is enabled + ; GCN-LABEL: {{^}}dotproduct_diffvecorder ; GFX900: v_mad_mix_f32 ; GFX900: v_mad_mix_f32 @@ -99,6 +108,7 @@ entry: ; GFX906-CONTRACT: v_dot2_f32_f16 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16 +; GFX906-DOT10-DISABLED: v_fma_mix_f32 define amdgpu_kernel void @dotproduct_diffvecorder(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -136,6 +146,7 @@ entry: ; GFX906-CONTRACT: v_fma_mix_f32 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32 +; GFX906-DOT10-DISABLED: v_fma_mix_f32 define amdgpu_kernel void @dotproduct_v4f16(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -173,6 +184,7 @@ entry: ; GFX906-CONTRACT: v_fma_mix_f32 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32 +; GFX906-DOT10-DISABLED: v_fma_mix_f32 define amdgpu_kernel void @NotAdotproduct(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -210,6 +222,7 @@ entry: ; GFX906-CONTRACT: v_fma_mix_f32 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32 +; GFX906-DOT10-DISABLED: v_fma_mix_f32 define amdgpu_kernel void @Diff_Idx_NotAdotproduct(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll index 12593e3760fd..4c68b8d35260 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -20,7 +20,7 @@ define amdgpu_kernel void @store_flat_i32(ptr addrspace(1) %gptr, i32 %x) #0 { } ; GCN-LABEL: {{^}}store_flat_i64: -; GCN: flat_store_{{dwordx2|b64}} +; GCN: flat_store_{{dword|b64}} define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 { %fptr = addrspacecast ptr addrspace(1) %gptr to ptr store volatile i64 %x, ptr %fptr, align 8 @@ -28,7 +28,7 @@ define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 { } ; GCN-LABEL: {{^}}store_flat_v4i32: -; GCN: flat_store_{{dwordx4|b128}} +; GCN: flat_store_{{dword|b128}} define amdgpu_kernel void @store_flat_v4i32(ptr addrspace(1) %gptr, <4 x i32> %x) #0 { %fptr = addrspacecast ptr addrspace(1) %gptr to ptr store volatile <4 x i32> %x, ptr %fptr, align 16 @@ -65,7 +65,7 @@ define amdgpu_kernel void @load_flat_i32(ptr addrspace(1) noalias %out, ptr addr } ; GCN-LABEL: load_flat_i64: -; GCN: flat_load_{{dwordx2|b64}} +; GCN: flat_load_{{dword|b64}} define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 { %fptr = addrspacecast ptr addrspace(1) %gptr to ptr %fload = load volatile i64, ptr %fptr, align 8 @@ -74,7 +74,7 @@ define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addr } ; GCN-LABEL: load_flat_v4i32: -; GCN: flat_load_{{dwordx4|b128}} +; GCN: flat_load_{{dword|b128}} define amdgpu_kernel void @load_flat_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 { %fptr = addrspacecast ptr addrspace(1) %gptr to ptr %fload = load volatile <4 x i32>, ptr %fptr, align 32 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll index 87d63f4f9cd9..45223a24e021 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -96,9 +96,9 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: TotalNumSgprs: 4 -; VI-NOXNACK: TotalNumSgprs: 6 -; VI-XNACK: TotalNumSgprs: 6 +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 ; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 ; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr() #0 { @@ -113,9 +113,9 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: TotalNumSgprs: 4 -; VI-NOXNACK: TotalNumSgprs: 6 -; VI-XNACK: TotalNumSgprs: 6 +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 ; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 ; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_lo() #0 { @@ -130,9 +130,9 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: TotalNumSgprs: 4 -; VI-NOXNACK: TotalNumSgprs: 6 -; VI-XNACK: TotalNumSgprs: 6 +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 ; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 ; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_hi() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 711a5fff1a06..104e157e9e15 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -3307,22 +3307,54 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) { ; -------------------------------------------------------------------- define float @v_mul_f32_select_64_1(i32 %arg, float %x) { -; GFX9-LABEL: v_mul_f32_select_64_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_mul_f32_select_64_1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_f32_select_64_1: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1011-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo -; GFX1011-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_mul_f32_select_64_1: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc +; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_mul_f32_select_64_1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_f32_select_64_1: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_f32_select_64_1: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_f32_select_64_1: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo +; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, float 64.0, float 1.0 %mul = fmul float %x, %select.pow2 @@ -3330,22 +3362,54 @@ define float @v_mul_f32_select_64_1(i32 %arg, float %x) { } define float @v_mul_f32_select_1_64(i32 %arg, float %x) { -; GFX9-LABEL: v_mul_f32_select_1_64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 1.0, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_mul_f32_select_1_64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_f32_select_1_64: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo -; GFX1011-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_mul_f32_select_1_64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 1.0, vcc +; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_mul_f32_select_1_64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_f32_select_1_64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_f32_select_1_64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_f32_select_1_64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo +; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, float 1.0, float 64.0 %mul = fmul float %x, %select.pow2 @@ -3353,22 +3417,54 @@ define float @v_mul_f32_select_1_64(i32 %arg, float %x) { } define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) { -; GFX9-LABEL: v_mul_f32_select_n1_n64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0xc2800000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1.0, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_mul_f32_select_n1_n64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_f32_select_n1_n64: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo -; GFX1011-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_mul_f32_select_n1_n64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2800000 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1.0, vcc +; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_mul_f32_select_n1_n64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_f32_select_n1_n64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_f32_select_n1_n64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_f32_select_n1_n64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo +; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, float -1.0, float -64.0 %mul = fmul float %x, %select.pow2 @@ -3376,22 +3472,54 @@ define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) { } define float @v_mul_f32_select_n64_n1(i32 %arg, float %x) { -; GFX9-LABEL: v_mul_f32_select_n64_n1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0xc2800000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_mul_f32_select_n64_n1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_mul_f32_select_n64_n1: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1011-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo -; GFX1011-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_mul_f32_select_n64_n1: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2800000 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc +; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_mul_f32_select_n64_n1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_mul_f32_select_n64_n1: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_mul_f32_select_n64_n1: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_mul_f32_select_n64_n1: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo +; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, float -64.0, float -1.0 %mul = fmul float %x, %select.pow2 @@ -3402,11 +3530,9 @@ define float @v_mul_f32_select_128_64(i32 %arg, float %x) { ; GFX9-SDAG-LABEL: v_mul_f32_select_128_64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x43000000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc +; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f32_select_128_64: @@ -3422,10 +3548,9 @@ define float @v_mul_f32_select_128_64(i32 %arg, float %x) { ; GFX10-SDAG-LABEL: v_mul_f32_select_128_64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0x43000000 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x42800000, v2, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f32_select_128_64: @@ -3440,10 +3565,9 @@ define float @v_mul_f32_select_128_64(i32 %arg, float %x) { ; GFX11-SDAG-LABEL: v_mul_f32_select_128_64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0x43000000 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x42800000, v2, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f32_select_128_64: @@ -3464,11 +3588,9 @@ define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) { ; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2800000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc3000000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc +; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n64: @@ -3484,10 +3606,9 @@ define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) { ; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0xc3000000 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xc2800000, v2, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n64: @@ -3502,10 +3623,9 @@ define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) { ; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0xc3000000 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xc2800000, v2, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n64: @@ -3526,11 +3646,9 @@ define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) { ; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n16: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0xc1800000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc3000000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc +; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n16: @@ -3546,10 +3664,9 @@ define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) { ; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n16: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0xc3000000 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xc1800000, v2, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n16: @@ -3564,10 +3681,9 @@ define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) { ; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0xc3000000 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xc1800000, v2, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n16: @@ -3818,12 +3934,9 @@ define double @v_mul_f64_select_64_1(i32 %arg, double %x) { ; GFX9-SDAG-LABEL: v_mul_f64_select_64_1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x3ff00000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x40500000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f64_select_64_1: @@ -3840,11 +3953,9 @@ define double @v_mul_f64_select_64_1(i32 %arg, double %x) { ; GFX10-SDAG-LABEL: v_mul_f64_select_64_1: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0x40500000 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, 0x3ff00000, v4, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f64_select_64_1: @@ -3860,10 +3971,9 @@ define double @v_mul_f64_select_64_1(i32 %arg, double %x) { ; GFX11-SDAG-LABEL: v_mul_f64_select_64_1: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v4, 0x3ff00000, v4, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f64_select_64_1: @@ -3884,12 +3994,9 @@ define double @v_mul_f64_select_1_64(i32 %arg, double %x) { ; GFX9-SDAG-LABEL: v_mul_f64_select_1_64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x40500000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x3ff00000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f64_select_1_64: @@ -3906,11 +4013,9 @@ define double @v_mul_f64_select_1_64(i32 %arg, double %x) { ; GFX10-SDAG-LABEL: v_mul_f64_select_1_64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0x3ff00000 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, 0x40500000, v4, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f64_select_1_64: @@ -3926,10 +4031,9 @@ define double @v_mul_f64_select_1_64(i32 %arg, double %x) { ; GFX11-SDAG-LABEL: v_mul_f64_select_1_64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0x3ff00000 :: v_dual_mov_b32 v3, 0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v4, 0x40500000, v4, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f64_select_1_64: @@ -3950,12 +4054,9 @@ define double @v_mul_f64_select_n1_n64(i32 %arg, double %x) { ; GFX9-SDAG-LABEL: v_mul_f64_select_n1_n64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc0500000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xbff00000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f64_select_n1_n64: @@ -3972,11 +4073,9 @@ define double @v_mul_f64_select_n1_n64(i32 %arg, double %x) { ; GFX10-SDAG-LABEL: v_mul_f64_select_n1_n64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0xbff00000 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, 0xc0500000, v4, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f64_select_n1_n64: @@ -3992,10 +4091,9 @@ define double @v_mul_f64_select_n1_n64(i32 %arg, double %x) { ; GFX11-SDAG-LABEL: v_mul_f64_select_n1_n64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0xbff00000 :: v_dual_mov_b32 v3, 0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v4, 0xc0500000, v4, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f64_select_n1_n64: @@ -4016,12 +4114,9 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) { ; GFX9-SDAG-LABEL: v_mul_f64_select_128_64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x40500000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x40600000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc +; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f64_select_128_64: @@ -4038,11 +4133,9 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) { ; GFX10-SDAG-LABEL: v_mul_f64_select_128_64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0x40600000 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, 0x40500000, v4, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f64_select_128_64: @@ -4058,10 +4151,9 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) { ; GFX11-SDAG-LABEL: v_mul_f64_select_128_64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0x40600000 :: v_dual_mov_b32 v3, 0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v4, 0x40500000, v4, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f64_select_128_64: @@ -4082,12 +4174,9 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) { ; GFX9-SDAG-LABEL: v_mul_f64_select_n128_n64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc0500000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xc0600000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc +; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n64: @@ -4104,11 +4193,9 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) { ; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0xc0600000 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, 0xc0500000, v4, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n64: @@ -4124,10 +4211,9 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) { ; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0xc0600000 :: v_dual_mov_b32 v3, 0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v4, 0xc0500000, v4, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f64_select_n128_n64: @@ -4148,12 +4234,9 @@ define double @v_mul_f64_select_n128_n16(i32 %arg, double %x) { ; GFX9-SDAG-LABEL: v_mul_f64_select_n128_n16: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc0300000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xc0600000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc +; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n16: @@ -4170,11 +4253,9 @@ define double @v_mul_f64_select_n128_n16(i32 %arg, double %x) { ; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n16: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0xc0600000 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, 0xc0300000, v4, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n16: @@ -4190,10 +4271,9 @@ define double @v_mul_f64_select_n128_n16(i32 %arg, double %x) { ; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0xc0600000 :: v_dual_mov_b32 v3, 0 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v4, 0xc0300000, v4, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f64_select_n128_n16: @@ -4719,11 +4799,9 @@ define half @v_mul_f16_select_64_1(i32 %arg, half %x) { ; GFX9-SDAG-LABEL: v_mul_f16_select_64_1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f16_select_64_1: @@ -4739,10 +4817,9 @@ define half @v_mul_f16_select_64_1(i32 %arg, half %x) { ; GFX10-SDAG-LABEL: v_mul_f16_select_64_1: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0x5400 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f16_select_64_1: @@ -4757,10 +4834,9 @@ define half @v_mul_f16_select_64_1(i32 %arg, half %x) { ; GFX11-SDAG-LABEL: v_mul_f16_select_64_1: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0x5400 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f16_select_64_1: @@ -4781,11 +4857,9 @@ define half @v_mul_f16_select_1_64(i32 %arg, half %x) { ; GFX9-SDAG-LABEL: v_mul_f16_select_1_64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x5400 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f16_select_1_64: @@ -4801,10 +4875,9 @@ define half @v_mul_f16_select_1_64(i32 %arg, half %x) { ; GFX10-SDAG-LABEL: v_mul_f16_select_1_64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v2, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f16_select_1_64: @@ -4819,10 +4892,9 @@ define half @v_mul_f16_select_1_64(i32 %arg, half %x) { ; GFX11-SDAG-LABEL: v_mul_f16_select_1_64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v2, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f16_select_1_64: @@ -4843,11 +4915,9 @@ define half @v_mul_f16_select_n1_n64(i32 %arg, half %x) { ; GFX9-SDAG-LABEL: v_mul_f16_select_n1_n64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0xd400 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xbc00 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f16_select_n1_n64: @@ -4863,10 +4933,9 @@ define half @v_mul_f16_select_n1_n64(i32 %arg, half %x) { ; GFX10-SDAG-LABEL: v_mul_f16_select_n1_n64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0xbc00 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v2, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f16_select_n1_n64: @@ -4881,10 +4950,9 @@ define half @v_mul_f16_select_n1_n64(i32 %arg, half %x) { ; GFX11-SDAG-LABEL: v_mul_f16_select_n1_n64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0xbc00 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v2, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f16_select_n1_n64: @@ -4905,11 +4973,9 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) { ; GFX9-SDAG-LABEL: v_mul_f16_select_128_64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x5400 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x5800 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f16_select_128_64: @@ -4925,10 +4991,9 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) { ; GFX10-SDAG-LABEL: v_mul_f16_select_128_64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0x5800 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v2, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f16_select_128_64: @@ -4943,10 +5008,9 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) { ; GFX11-SDAG-LABEL: v_mul_f16_select_128_64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0x5800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v2, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f16_select_128_64: @@ -4967,11 +5031,9 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) { ; GFX9-SDAG-LABEL: v_mul_f16_select_n128_n64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0xd400 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xd800 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc +; GFX9-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n64: @@ -4987,10 +5049,9 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) { ; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0xd800 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v2, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n64: @@ -5005,10 +5066,9 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) { ; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0xd800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v2, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n64: @@ -5029,11 +5089,9 @@ define half @v_mul_f16_select_n128_n16(i32 %arg, half %x) { ; GFX9-SDAG-LABEL: v_mul_f16_select_n128_n16: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0xcc00 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xd800 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc +; GFX9-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n16: @@ -5049,10 +5107,9 @@ define half @v_mul_f16_select_n128_n16(i32 %arg, half %x) { ; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n16: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0xd800 ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xcc00, v2, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n16: @@ -5067,10 +5124,9 @@ define half @v_mul_f16_select_n128_n16(i32 %arg, half %x) { ; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0xd800 ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xcc00, v2, vcc_lo -; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n16: diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index 8e04a240d0a1..b3001819e9aa 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -2368,14 +2368,12 @@ define float @v_sqrt_f32_ulp2_contractable_rcp(float %x) { ; SDAG-IEEE: ; %bb.0: ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4b800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SDAG-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: v_rsq_f32_e32 v0, v0 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x45800000 -; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SDAG-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_rcp: @@ -2718,20 +2716,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) { ; SDAG-IEEE: ; %bb.0: ; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, 0x4b800000 ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc ; SDAG-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1 -; SDAG-IEEE-NEXT: v_mul_f32_e32 v0, v0, v3 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[4:5] +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, s[4:5] ; SDAG-IEEE-NEXT: v_rsq_f32_e32 v0, v0 -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2 +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; SDAG-IEEE-NEXT: v_rsq_f32_e32 v1, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, 0x45800000 -; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; SDAG-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[4:5] -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, s[4:5] +; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp: diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll index 020c9dc130bb..61ae9639c52d 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll @@ -465,19 +465,12 @@ main_body: } define amdgpu_ps void @s_buffer_load_byte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) { -; DAG-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent: -; DAG: ; %bb.0: ; %main_body -; DAG-NEXT: buffer_load_i8 v2, v2, s[0:3], null offen -; DAG-NEXT: s_wait_loadcnt 0x0 -; DAG-NEXT: global_store_b32 v[0:1], v2, off -; DAG-NEXT: s_endpgm -; -; GISEL-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent: -; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen -; GISEL-NEXT: s_wait_loadcnt 0x0 -; GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GISEL-NEXT: s_endpgm +; GCN-LABEL: s_buffer_load_byte_sgpr_or_imm_offset_divergent: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: buffer_load_i8 v2, v2, s[0:3], null offen +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_store_b32 v[0:1], v2, off +; GCN-NEXT: s_endpgm main_body: %ld = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %src, i32 %offset, i32 0) %sext = sext i8 %ld to i32 @@ -538,20 +531,12 @@ main_body: } define amdgpu_ps void @s_buffer_load_ubyte_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) { -; DAG-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent: -; DAG: ; %bb.0: ; %main_body -; DAG-NEXT: buffer_load_u8 v2, v2, s[0:3], null offen -; DAG-NEXT: s_wait_loadcnt 0x0 -; DAG-NEXT: global_store_b32 v[0:1], v2, off -; DAG-NEXT: s_endpgm -; -; GISEL-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent: -; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen -; GISEL-NEXT: s_wait_loadcnt 0x0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GISEL-NEXT: s_endpgm +; GCN-LABEL: s_buffer_load_ubyte_sgpr_or_imm_offset_divergent: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: buffer_load_u8 v2, v2, s[0:3], null offen +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_store_b32 v[0:1], v2, off +; GCN-NEXT: s_endpgm main_body: %ld = call i8 @llvm.amdgcn.s.buffer.load.u8(<4 x i32> %src, i32 %offset, i32 0) %zext = zext i8 %ld to i32 @@ -606,19 +591,12 @@ main_body: } define amdgpu_ps void @s_buffer_load_short_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) { -; DAG-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent: -; DAG: ; %bb.0: ; %main_body -; DAG-NEXT: buffer_load_i16 v2, v2, s[0:3], null offen -; DAG-NEXT: s_wait_loadcnt 0x0 -; DAG-NEXT: global_store_b32 v[0:1], v2, off -; DAG-NEXT: s_endpgm -; -; GISEL-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent: -; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen -; GISEL-NEXT: s_wait_loadcnt 0x0 -; GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GISEL-NEXT: s_endpgm +; GCN-LABEL: s_buffer_load_short_sgpr_or_imm_offset_divergent: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: buffer_load_i16 v2, v2, s[0:3], null offen +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_store_b32 v[0:1], v2, off +; GCN-NEXT: s_endpgm main_body: %ld = call i16 @llvm.amdgcn.s.buffer.load.i16(<4 x i32> %src, i32 %offset, i32 0) %sext = sext i16 %ld to i32 @@ -679,20 +657,12 @@ main_body: } define amdgpu_ps void @s_buffer_load_ushort_sgpr_or_imm_offset_divergent(<4 x i32> inreg %src, ptr addrspace(1) nocapture %out, i32 %offset) { -; DAG-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent: -; DAG: ; %bb.0: ; %main_body -; DAG-NEXT: buffer_load_u16 v2, v2, s[0:3], null offen -; DAG-NEXT: s_wait_loadcnt 0x0 -; DAG-NEXT: global_store_b32 v[0:1], v2, off -; DAG-NEXT: s_endpgm -; -; GISEL-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent: -; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: buffer_load_b32 v2, v2, s[0:3], null offen -; GISEL-NEXT: s_wait_loadcnt 0x0 -; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: global_store_b32 v[0:1], v2, off -; GISEL-NEXT: s_endpgm +; GCN-LABEL: s_buffer_load_ushort_sgpr_or_imm_offset_divergent: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: buffer_load_u16 v2, v2, s[0:3], null offen +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_store_b32 v[0:1], v2, off +; GCN-NEXT: s_endpgm main_body: %ld = call i16 @llvm.amdgcn.s.buffer.load.u16(<4 x i32> %src, i32 %offset, i32 0) %zext = zext i16 %ld to i32 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll index 6b4a6381d954..bfd57aebad52 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s ; Test using saddr addressing mode of global_* flat atomic instructions. @@ -85,6 +86,34 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst @@ -168,6 +197,34 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -245,6 +302,31 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_cbranch_execnz .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB2_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst @@ -320,6 +402,31 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB3_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -415,6 +522,38 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB4_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst @@ -510,6 +649,38 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -596,6 +767,34 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_cbranch_execnz .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst @@ -680,6 +879,34 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -767,6 +994,34 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst @@ -850,6 +1105,34 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -927,6 +1210,31 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst @@ -1002,6 +1310,31 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1097,6 +1430,38 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst @@ -1192,6 +1557,38 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1278,6 +1675,34 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst @@ -1362,6 +1787,34 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1449,6 +1902,34 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst @@ -1532,6 +2013,34 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1609,6 +2118,31 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst @@ -1684,6 +2218,31 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1779,6 +2338,38 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst @@ -1874,6 +2465,38 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1960,6 +2583,34 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst @@ -2044,6 +2695,34 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2131,6 +2810,34 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst @@ -2214,6 +2921,34 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2291,6 +3026,31 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_min_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst @@ -2366,6 +3126,31 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_min_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2461,6 +3246,38 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst @@ -2556,6 +3373,38 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2642,6 +3491,34 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst @@ -2726,6 +3603,34 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll index fb72dcacee4c..a7225a104ff3 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s ; Test using saddr addressing mode of global_* flat atomic instructions. @@ -28,6 +29,13 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -58,6 +66,13 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(ptr addrspace(1) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_i32_nortn_offset_2047: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2047 @@ -89,6 +104,13 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(ptr addrspace( ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048 @@ -119,6 +141,13 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -153,6 +182,13 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_i32_rtn_2048: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2048 @@ -184,6 +220,13 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_i32_rtn_neg2048: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048 @@ -238,6 +281,18 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: ds_load_b64 v[2:3], v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -286,6 +341,18 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: ds_load_b64 v[2:3], v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -335,6 +402,18 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: ds_load_b64 v[2:3], v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -382,6 +461,18 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i3 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: ds_load_b64 v[2:3], v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] offset:42 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -421,6 +512,13 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -451,6 +549,13 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -482,6 +587,13 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -511,6 +623,13 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -545,6 +664,13 @@ define amdgpu_ps float @global_add_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_add_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw add ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -575,6 +701,13 @@ define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_add_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -606,6 +739,13 @@ define amdgpu_ps void @global_add_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_add_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw add ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -635,6 +775,13 @@ define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_add_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -665,6 +812,13 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_add_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw add ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -695,6 +849,13 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_add_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -726,6 +887,13 @@ define amdgpu_ps void @global_add_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_add_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw add ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -755,6 +923,13 @@ define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_add_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -789,6 +964,13 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_sub_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -819,6 +1001,13 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_sub_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -850,6 +1039,13 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_sub_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -879,6 +1075,13 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_sub_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -909,6 +1112,13 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_sub_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -939,6 +1149,13 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_sub_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -970,6 +1187,13 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_sub_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -999,6 +1223,13 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_sub_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1033,6 +1264,13 @@ define amdgpu_ps float @global_and_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_and_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1063,6 +1301,13 @@ define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_and_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1094,6 +1339,13 @@ define amdgpu_ps void @global_and_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_and_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1123,6 +1375,13 @@ define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_and_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1153,6 +1412,13 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_and_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1183,6 +1449,13 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_and_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1214,6 +1487,13 @@ define amdgpu_ps void @global_and_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_and_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1243,6 +1523,13 @@ define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_and_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1277,6 +1564,13 @@ define amdgpu_ps float @global_or_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_or_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1307,6 +1601,13 @@ define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_or_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1338,6 +1639,13 @@ define amdgpu_ps void @global_or_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_or_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1367,6 +1675,13 @@ define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_or_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1397,6 +1712,13 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(ptr addrspace(1) inreg %sb ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_or_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1427,6 +1749,13 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(ptr addrspace(1) in ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_or_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1458,6 +1787,13 @@ define amdgpu_ps void @global_or_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_or_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1487,6 +1823,13 @@ define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_or_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1521,6 +1864,13 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xor_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1551,6 +1901,13 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xor_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1582,6 +1939,13 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xor_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1611,6 +1975,13 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xor_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1641,6 +2012,13 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xor_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1671,6 +2049,13 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xor_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1702,6 +2087,13 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xor_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1731,6 +2123,13 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xor_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1762,6 +2161,13 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -1789,6 +2195,13 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1816,6 +2229,13 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -1841,6 +2261,13 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1868,6 +2295,13 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -1895,6 +2329,13 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1922,6 +2363,13 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -1947,6 +2395,13 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1978,6 +2433,13 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2005,6 +2467,13 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2032,6 +2501,13 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2057,6 +2533,13 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2084,6 +2567,13 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2111,6 +2601,13 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2138,6 +2635,13 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2163,6 +2667,13 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2194,6 +2705,13 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2221,6 +2739,13 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2248,6 +2773,13 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2273,6 +2805,13 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2300,6 +2839,13 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2327,6 +2873,13 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2354,6 +2907,13 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2379,6 +2939,13 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2410,6 +2977,13 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2437,6 +3011,13 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2464,6 +3045,13 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2489,6 +3077,13 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2516,6 +3111,13 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2543,6 +3145,13 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2570,6 +3179,13 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2595,6 +3211,13 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2632,6 +3255,15 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(ptr addrspace(1) inreg %sba ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_cmpxchg_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %cmpxchg = cmpxchg ptr addrspace(1) %gep0, i32 %cmp, i32 %data seq_cst seq_cst @@ -2666,6 +3298,15 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(ptr addrspace(1) inr ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_cmpxchg_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2701,6 +3342,15 @@ define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn(ptr addrspace(1) inreg %sb ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_cmpxchg_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = cmpxchg ptr addrspace(1) %gep0, i32 %cmp, i32 %data seq_cst seq_cst @@ -2733,6 +3383,15 @@ define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn_neg128(ptr addrspace(1) in ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_cmpxchg_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2769,6 +3428,16 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(ptr addrspace(1) inre ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_cmpxchg_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %cmpxchg = cmpxchg ptr addrspace(1) %gep0, i64 %cmp, i64 %data seq_cst seq_cst @@ -2806,6 +3475,16 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(ptr addrspace( ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_cmpxchg_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2844,6 +3523,16 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(ptr addrspace(1) inreg %sb ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_cmpxchg_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v0, v[3:6], s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = cmpxchg ptr addrspace(1) %gep0, i64 %cmp, i64 %data seq_cst seq_cst @@ -2879,6 +3568,16 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(ptr addrspace(1) in ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_cmpxchg_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v0, v[3:6], s[2:3] offset:-128 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2902,6 +3601,12 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_inc_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic @@ -2921,6 +3626,12 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_inc_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2939,6 +3650,11 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_inc_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic @@ -2955,6 +3671,11 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] offset:-128 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_inc_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2974,6 +3695,12 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_inc_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic @@ -2993,6 +3720,12 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_inc_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3011,6 +3744,11 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_inc_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic @@ -3027,6 +3765,11 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] offset:-128 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_inc_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3051,6 +3794,12 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_dec_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic @@ -3070,6 +3819,12 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_dec_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3088,6 +3843,11 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_dec_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic @@ -3104,6 +3864,11 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] offset:-128 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_dec_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3123,6 +3888,12 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_dec_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic @@ -3142,6 +3913,12 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_dec_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3160,6 +3937,11 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_dec_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic @@ -3176,6 +3958,11 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] offset:-128 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_dec_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll new file mode 100644 index 000000000000..80d4fa69be42 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll @@ -0,0 +1,461 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 { +; GFX11-LABEL: group_image_sample: +; GFX11: ; %bb.0: ; %.entry +; GFX11-NEXT: s_mov_b32 s33, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: s_mov_b32 m0, s4 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_mov_b32 s0, s1 +; GFX11-NEXT: s_mov_b32 s6, s3 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s7, s5 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[6:7], 0x0 +; GFX11-NEXT: s_mov_b32 s16, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15 +; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15 +; GFX11-NEXT: s_mov_b32 exec_lo, s16 +; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1 +; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x10 +; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x20 +; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x30 +; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x40 +; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x50 +; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0x60 +; GFX11-NEXT: s_buffer_load_b64 s[28:29], s[12:15], 0x70 +; GFX11-NEXT: s_buffer_load_b64 s[30:31], s[12:15], 0x80 +; GFX11-NEXT: s_buffer_load_b64 s[34:35], s[12:15], 0x90 +; GFX11-NEXT: s_buffer_load_b64 s[36:37], s[12:15], 0xa0 +; GFX11-NEXT: s_buffer_load_b64 s[38:39], s[12:15], 0xb0 +; GFX11-NEXT: s_buffer_load_b64 s[40:41], s[12:15], 0xc0 +; GFX11-NEXT: s_buffer_load_b64 s[42:43], s[12:15], 0xd0 +; GFX11-NEXT: s_buffer_load_b64 s[44:45], s[12:15], 0xe0 +; GFX11-NEXT: s_buffer_load_b64 s[46:47], s[12:15], 0xf0 +; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100 +; GFX11-NEXT: v_interp_p2_f32 v36, v2, v1, v4 wait_exp:7 +; GFX11-NEXT: v_interp_p2_f32 v0, v3, v1, v0 wait_exp:7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v5, s17, v36 +; GFX11-NEXT: v_add_f32_e32 v4, s16, v0 +; GFX11-NEXT: v_add_f32_e32 v8, s18, v0 +; GFX11-NEXT: v_add_f32_e32 v9, s19, v36 +; GFX11-NEXT: v_add_f32_e32 v12, s20, v0 +; GFX11-NEXT: v_add_f32_e32 v13, s21, v36 +; GFX11-NEXT: v_add_f32_e32 v16, s22, v0 +; GFX11-NEXT: v_add_f32_e32 v17, s23, v36 +; GFX11-NEXT: v_add_f32_e32 v20, s24, v0 +; GFX11-NEXT: v_add_f32_e32 v21, s25, v36 +; GFX11-NEXT: v_add_f32_e32 v24, s26, v0 +; GFX11-NEXT: v_add_f32_e32 v25, s27, v36 +; GFX11-NEXT: v_add_f32_e32 v28, s28, v0 +; GFX11-NEXT: v_add_f32_e32 v29, s29, v36 +; GFX11-NEXT: v_add_f32_e32 v32, s30, v0 +; GFX11-NEXT: v_add_f32_e32 v33, s31, v36 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: image_sample v[4:7], v[4:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[12:15], v[12:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[28:31], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: v_add_f32_e32 v37, s34, v0 +; GFX11-NEXT: v_add_f32_e32 v38, s35, v36 +; GFX11-NEXT: v_add_f32_e32 v40, s36, v0 +; GFX11-NEXT: v_add_f32_e32 v41, s37, v36 +; GFX11-NEXT: v_add_f32_e32 v44, s38, v0 +; GFX11-NEXT: v_add_f32_e32 v45, s39, v36 +; GFX11-NEXT: v_add_f32_e32 v48, s40, v0 +; GFX11-NEXT: v_add_f32_e32 v49, s41, v36 +; GFX11-NEXT: v_add_f32_e32 v52, s42, v0 +; GFX11-NEXT: v_add_f32_e32 v53, s43, v36 +; GFX11-NEXT: v_add_f32_e32 v56, s44, v0 +; GFX11-NEXT: v_add_f32_e32 v57, s45, v36 +; GFX11-NEXT: v_add_f32_e32 v60, s46, v0 +; GFX11-NEXT: v_add_f32_e32 v61, s47, v36 +; GFX11-NEXT: v_add_f32_e32 v0, s12, v0 +; GFX11-NEXT: v_add_f32_e32 v1, s13, v36 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s33 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: image_sample v[36:39], v[37:38], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[44:47], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[52:55], v[52:53], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[60:63], v[60:61], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_sample v[64:67], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_dual_add_f32 v0, v8, v4 :: v_dual_add_f32 v1, v9, v5 +; GFX11-NEXT: v_dual_add_f32 v4, v10, v6 :: v_dual_add_f32 v5, v11, v7 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v12, v0 :: v_dual_add_f32 v1, v13, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v14, v4 :: v_dual_add_f32 v5, v15, v5 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v16, v0 :: v_dual_add_f32 v1, v17, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v18, v4 :: v_dual_add_f32 v5, v19, v5 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v20, v0 :: v_dual_add_f32 v1, v21, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v22, v4 :: v_dual_add_f32 v5, v23, v5 +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v24, v0 :: v_dual_add_f32 v1, v25, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v26, v4 :: v_dual_add_f32 v5, v27, v5 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v28, v0 :: v_dual_add_f32 v1, v29, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v30, v4 :: v_dual_add_f32 v5, v31, v5 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v32, v0 :: v_dual_add_f32 v1, v33, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v34, v4 :: v_dual_add_f32 v5, v35, v5 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v36, v0 :: v_dual_add_f32 v1, v37, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v38, v4 :: v_dual_add_f32 v5, v39, v5 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v40, v0 :: v_dual_add_f32 v1, v41, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v42, v4 :: v_dual_add_f32 v5, v43, v5 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v44, v0 :: v_dual_add_f32 v1, v45, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v46, v4 :: v_dual_add_f32 v5, v47, v5 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v48, v0 :: v_dual_add_f32 v1, v49, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v50, v4 :: v_dual_add_f32 v5, v51, v5 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v52, v0 :: v_dual_add_f32 v1, v53, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v54, v4 :: v_dual_add_f32 v5, v55, v5 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v56, v0 :: v_dual_add_f32 v1, v57, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v58, v4 :: v_dual_add_f32 v5, v59, v5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v60, v0 :: v_dual_add_f32 v1, v61, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v62, v4 :: v_dual_add_f32 v5, v63, v5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v0, v64, v0 :: v_dual_add_f32 v1, v65, v1 +; GFX11-NEXT: v_dual_add_f32 v4, v66, v4 :: v_dual_add_f32 v5, v67, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v4, v5 +; GFX11-NEXT: exp mrt0 v0, v1, off, off done +; GFX11-NEXT: s_endpgm +.entry: + %i = call i64 @llvm.amdgcn.s.getpc() + %i1 = and i64 %i, -4294967296 + %i2 = zext i32 %userdata6 to i64 + %i3 = or disjoint i64 %i1, %i2 + %i4 = inttoptr i64 %i3 to ptr addrspace(4) + %i5 = load <4 x i32>, ptr addrspace(4) %i4, align 16 + %i6 = zext i32 %userdata7 to i64 + %i7 = or disjoint i64 %i1, %i6 + %i8 = inttoptr i64 %i7 to ptr addrspace(4) + %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 4, !invariant.load !0 + %i10 = zext i32 %userdata8 to i64 + %i11 = or disjoint i64 %i1, %i10 + %i12 = inttoptr i64 %i11 to ptr addrspace(4) + %i13 = load <8 x i32>, ptr addrspace(4) %i12, align 4, !invariant.load !0 + %i14 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %PrimMask) + %PerspInterpCenter.i1 = extractelement <2 x float> %PerspInterpCenter, i64 1 + %PerspInterpCenter.i0 = extractelement <2 x float> %PerspInterpCenter, i64 0 + %i15 = call float @llvm.amdgcn.interp.inreg.p10(float %i14, float %PerspInterpCenter.i0, float %i14) + %i16 = call float @llvm.amdgcn.interp.inreg.p2(float %i14, float %PerspInterpCenter.i1, float %i15) + %i17 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %PrimMask) + %i18 = call float @llvm.amdgcn.interp.inreg.p10(float %i17, float %PerspInterpCenter.i0, float %i17) + %i19 = call float @llvm.amdgcn.interp.inreg.p2(float %i17, float %PerspInterpCenter.i1, float %i18) + %i20 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 16, i32 0), !invariant.load !0 + %i21 = shufflevector <2 x i32> %i20, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i22 = bitcast <4 x i32> %i21 to <4 x float> + %.i0 = extractelement <4 x float> %i22, i64 0 + %.i1 = extractelement <4 x float> %i22, i64 1 + %.i03 = fadd reassoc nnan nsz arcp contract afn float %.i0, %i19 + %.i14 = fadd reassoc nnan nsz arcp contract afn float %.i1, %i16 + %i23 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i03, float %.i14, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i010 = extractelement <4 x float> %i23, i64 0 + %.i113 = extractelement <4 x float> %i23, i64 1 + %.i215 = extractelement <4 x float> %i23, i64 2 + %.i317 = extractelement <4 x float> %i23, i64 3 + %i24 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 32, i32 0), !invariant.load !0 + %i25 = shufflevector <2 x i32> %i24, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i26 = bitcast <4 x i32> %i25 to <4 x float> + %.i05 = extractelement <4 x float> %i26, i64 0 + %.i16 = extractelement <4 x float> %i26, i64 1 + %.i07 = fadd reassoc nnan nsz arcp contract afn float %.i05, %i19 + %.i18 = fadd reassoc nnan nsz arcp contract afn float %.i16, %i16 + %i27 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i07, float %.i18, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i09 = extractelement <4 x float> %i27, i64 0 + %.i011 = fadd reassoc nnan nsz arcp contract afn float %.i09, %.i010 + %.i112 = extractelement <4 x float> %i27, i64 1 + %.i114 = fadd reassoc nnan nsz arcp contract afn float %.i112, %.i113 + %.i2 = extractelement <4 x float> %i27, i64 2 + %.i216 = fadd reassoc nnan nsz arcp contract afn float %.i2, %.i215 + %.i3 = extractelement <4 x float> %i27, i64 3 + %.i318 = fadd reassoc nnan nsz arcp contract afn float %.i3, %.i317 + %i28 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 48, i32 0), !invariant.load !0 + %i29 = shufflevector <2 x i32> %i28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i30 = bitcast <4 x i32> %i29 to <4 x float> + %.i019 = extractelement <4 x float> %i30, i64 0 + %.i120 = extractelement <4 x float> %i30, i64 1 + %.i021 = fadd reassoc nnan nsz arcp contract afn float %.i019, %i19 + %.i122 = fadd reassoc nnan nsz arcp contract afn float %.i120, %i16 + %i31 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i021, float %.i122, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i023 = extractelement <4 x float> %i31, i64 0 + %.i024 = fadd reassoc nnan nsz arcp contract afn float %.i023, %.i011 + %.i125 = extractelement <4 x float> %i31, i64 1 + %.i126 = fadd reassoc nnan nsz arcp contract afn float %.i125, %.i114 + %.i227 = extractelement <4 x float> %i31, i64 2 + %.i228 = fadd reassoc nnan nsz arcp contract afn float %.i227, %.i216 + %.i329 = extractelement <4 x float> %i31, i64 3 + %.i330 = fadd reassoc nnan nsz arcp contract afn float %.i329, %.i318 + %i32 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 64, i32 0), !invariant.load !0 + %i33 = shufflevector <2 x i32> %i32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i34 = bitcast <4 x i32> %i33 to <4 x float> + %.i031 = extractelement <4 x float> %i34, i64 0 + %.i132 = extractelement <4 x float> %i34, i64 1 + %.i033 = fadd reassoc nnan nsz arcp contract afn float %.i031, %i19 + %.i134 = fadd reassoc nnan nsz arcp contract afn float %.i132, %i16 + %i35 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i033, float %.i134, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i035 = extractelement <4 x float> %i35, i64 0 + %.i036 = fadd reassoc nnan nsz arcp contract afn float %.i035, %.i024 + %.i137 = extractelement <4 x float> %i35, i64 1 + %.i138 = fadd reassoc nnan nsz arcp contract afn float %.i137, %.i126 + %.i239 = extractelement <4 x float> %i35, i64 2 + %.i240 = fadd reassoc nnan nsz arcp contract afn float %.i239, %.i228 + %.i341 = extractelement <4 x float> %i35, i64 3 + %.i342 = fadd reassoc nnan nsz arcp contract afn float %.i341, %.i330 + %i36 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 80, i32 0), !invariant.load !0 + %i37 = shufflevector <2 x i32> %i36, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i38 = bitcast <4 x i32> %i37 to <4 x float> + %.i043 = extractelement <4 x float> %i38, i64 0 + %.i144 = extractelement <4 x float> %i38, i64 1 + %.i045 = fadd reassoc nnan nsz arcp contract afn float %.i043, %i19 + %.i146 = fadd reassoc nnan nsz arcp contract afn float %.i144, %i16 + %i39 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i045, float %.i146, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i047 = extractelement <4 x float> %i39, i64 0 + %.i048 = fadd reassoc nnan nsz arcp contract afn float %.i047, %.i036 + %.i149 = extractelement <4 x float> %i39, i64 1 + %.i150 = fadd reassoc nnan nsz arcp contract afn float %.i149, %.i138 + %.i251 = extractelement <4 x float> %i39, i64 2 + %.i252 = fadd reassoc nnan nsz arcp contract afn float %.i251, %.i240 + %.i353 = extractelement <4 x float> %i39, i64 3 + %.i354 = fadd reassoc nnan nsz arcp contract afn float %.i353, %.i342 + %i40 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 96, i32 0), !invariant.load !0 + %i41 = shufflevector <2 x i32> %i40, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i42 = bitcast <4 x i32> %i41 to <4 x float> + %.i055 = extractelement <4 x float> %i42, i64 0 + %.i156 = extractelement <4 x float> %i42, i64 1 + %.i057 = fadd reassoc nnan nsz arcp contract afn float %.i055, %i19 + %.i158 = fadd reassoc nnan nsz arcp contract afn float %.i156, %i16 + %i43 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i057, float %.i158, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i059 = extractelement <4 x float> %i43, i64 0 + %.i060 = fadd reassoc nnan nsz arcp contract afn float %.i059, %.i048 + %.i161 = extractelement <4 x float> %i43, i64 1 + %.i162 = fadd reassoc nnan nsz arcp contract afn float %.i161, %.i150 + %.i263 = extractelement <4 x float> %i43, i64 2 + %.i264 = fadd reassoc nnan nsz arcp contract afn float %.i263, %.i252 + %.i365 = extractelement <4 x float> %i43, i64 3 + %.i366 = fadd reassoc nnan nsz arcp contract afn float %.i365, %.i354 + %i44 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 112, i32 0), !invariant.load !0 + %i45 = shufflevector <2 x i32> %i44, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i46 = bitcast <4 x i32> %i45 to <4 x float> + %.i067 = extractelement <4 x float> %i46, i64 0 + %.i168 = extractelement <4 x float> %i46, i64 1 + %.i069 = fadd reassoc nnan nsz arcp contract afn float %.i067, %i19 + %.i170 = fadd reassoc nnan nsz arcp contract afn float %.i168, %i16 + %i47 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i069, float %.i170, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i071 = extractelement <4 x float> %i47, i64 0 + %.i072 = fadd reassoc nnan nsz arcp contract afn float %.i071, %.i060 + %.i173 = extractelement <4 x float> %i47, i64 1 + %.i174 = fadd reassoc nnan nsz arcp contract afn float %.i173, %.i162 + %.i275 = extractelement <4 x float> %i47, i64 2 + %.i276 = fadd reassoc nnan nsz arcp contract afn float %.i275, %.i264 + %.i377 = extractelement <4 x float> %i47, i64 3 + %.i378 = fadd reassoc nnan nsz arcp contract afn float %.i377, %.i366 + %i48 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 128, i32 0), !invariant.load !0 + %i49 = shufflevector <2 x i32> %i48, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i50 = bitcast <4 x i32> %i49 to <4 x float> + %.i079 = extractelement <4 x float> %i50, i64 0 + %.i180 = extractelement <4 x float> %i50, i64 1 + %.i081 = fadd reassoc nnan nsz arcp contract afn float %.i079, %i19 + %.i182 = fadd reassoc nnan nsz arcp contract afn float %.i180, %i16 + %i51 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i081, float %.i182, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i083 = extractelement <4 x float> %i51, i64 0 + %.i084 = fadd reassoc nnan nsz arcp contract afn float %.i083, %.i072 + %.i185 = extractelement <4 x float> %i51, i64 1 + %.i186 = fadd reassoc nnan nsz arcp contract afn float %.i185, %.i174 + %.i287 = extractelement <4 x float> %i51, i64 2 + %.i288 = fadd reassoc nnan nsz arcp contract afn float %.i287, %.i276 + %.i389 = extractelement <4 x float> %i51, i64 3 + %.i390 = fadd reassoc nnan nsz arcp contract afn float %.i389, %.i378 + %i52 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 144, i32 0), !invariant.load !0 + %i53 = shufflevector <2 x i32> %i52, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i54 = bitcast <4 x i32> %i53 to <4 x float> + %.i091 = extractelement <4 x float> %i54, i64 0 + %.i192 = extractelement <4 x float> %i54, i64 1 + %.i093 = fadd reassoc nnan nsz arcp contract afn float %.i091, %i19 + %.i194 = fadd reassoc nnan nsz arcp contract afn float %.i192, %i16 + %i55 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i093, float %.i194, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i095 = extractelement <4 x float> %i55, i64 0 + %.i096 = fadd reassoc nnan nsz arcp contract afn float %.i095, %.i084 + %.i197 = extractelement <4 x float> %i55, i64 1 + %.i198 = fadd reassoc nnan nsz arcp contract afn float %.i197, %.i186 + %.i299 = extractelement <4 x float> %i55, i64 2 + %.i2100 = fadd reassoc nnan nsz arcp contract afn float %.i299, %.i288 + %.i3101 = extractelement <4 x float> %i55, i64 3 + %.i3102 = fadd reassoc nnan nsz arcp contract afn float %.i3101, %.i390 + %i56 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 160, i32 0), !invariant.load !0 + %i57 = shufflevector <2 x i32> %i56, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i58 = bitcast <4 x i32> %i57 to <4 x float> + %.i0103 = extractelement <4 x float> %i58, i64 0 + %.i1104 = extractelement <4 x float> %i58, i64 1 + %.i0105 = fadd reassoc nnan nsz arcp contract afn float %.i0103, %i19 + %.i1106 = fadd reassoc nnan nsz arcp contract afn float %.i1104, %i16 + %i59 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0105, float %.i1106, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0107 = extractelement <4 x float> %i59, i64 0 + %.i0108 = fadd reassoc nnan nsz arcp contract afn float %.i0107, %.i096 + %.i1109 = extractelement <4 x float> %i59, i64 1 + %.i1110 = fadd reassoc nnan nsz arcp contract afn float %.i1109, %.i198 + %.i2111 = extractelement <4 x float> %i59, i64 2 + %.i2112 = fadd reassoc nnan nsz arcp contract afn float %.i2111, %.i2100 + %.i3113 = extractelement <4 x float> %i59, i64 3 + %.i3114 = fadd reassoc nnan nsz arcp contract afn float %.i3113, %.i3102 + %i60 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 176, i32 0), !invariant.load !0 + %i61 = shufflevector <2 x i32> %i60, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i62 = bitcast <4 x i32> %i61 to <4 x float> + %.i0115 = extractelement <4 x float> %i62, i64 0 + %.i1116 = extractelement <4 x float> %i62, i64 1 + %.i0117 = fadd reassoc nnan nsz arcp contract afn float %.i0115, %i19 + %.i1118 = fadd reassoc nnan nsz arcp contract afn float %.i1116, %i16 + %i63 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0117, float %.i1118, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0119 = extractelement <4 x float> %i63, i64 0 + %.i0120 = fadd reassoc nnan nsz arcp contract afn float %.i0119, %.i0108 + %.i1121 = extractelement <4 x float> %i63, i64 1 + %.i1122 = fadd reassoc nnan nsz arcp contract afn float %.i1121, %.i1110 + %.i2123 = extractelement <4 x float> %i63, i64 2 + %.i2124 = fadd reassoc nnan nsz arcp contract afn float %.i2123, %.i2112 + %.i3125 = extractelement <4 x float> %i63, i64 3 + %.i3126 = fadd reassoc nnan nsz arcp contract afn float %.i3125, %.i3114 + %i64 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 192, i32 0), !invariant.load !0 + %i65 = shufflevector <2 x i32> %i64, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i66 = bitcast <4 x i32> %i65 to <4 x float> + %.i0127 = extractelement <4 x float> %i66, i64 0 + %.i1128 = extractelement <4 x float> %i66, i64 1 + %.i0129 = fadd reassoc nnan nsz arcp contract afn float %.i0127, %i19 + %.i1130 = fadd reassoc nnan nsz arcp contract afn float %.i1128, %i16 + %i67 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0129, float %.i1130, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0131 = extractelement <4 x float> %i67, i64 0 + %.i0132 = fadd reassoc nnan nsz arcp contract afn float %.i0131, %.i0120 + %.i1133 = extractelement <4 x float> %i67, i64 1 + %.i1134 = fadd reassoc nnan nsz arcp contract afn float %.i1133, %.i1122 + %.i2135 = extractelement <4 x float> %i67, i64 2 + %.i2136 = fadd reassoc nnan nsz arcp contract afn float %.i2135, %.i2124 + %.i3137 = extractelement <4 x float> %i67, i64 3 + %.i3138 = fadd reassoc nnan nsz arcp contract afn float %.i3137, %.i3126 + %i68 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 208, i32 0), !invariant.load !0 + %i69 = shufflevector <2 x i32> %i68, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i70 = bitcast <4 x i32> %i69 to <4 x float> + %.i0139 = extractelement <4 x float> %i70, i64 0 + %.i1140 = extractelement <4 x float> %i70, i64 1 + %.i0141 = fadd reassoc nnan nsz arcp contract afn float %.i0139, %i19 + %.i1142 = fadd reassoc nnan nsz arcp contract afn float %.i1140, %i16 + %i71 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0141, float %.i1142, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0143 = extractelement <4 x float> %i71, i64 0 + %.i0144 = fadd reassoc nnan nsz arcp contract afn float %.i0143, %.i0132 + %.i1145 = extractelement <4 x float> %i71, i64 1 + %.i1146 = fadd reassoc nnan nsz arcp contract afn float %.i1145, %.i1134 + %.i2147 = extractelement <4 x float> %i71, i64 2 + %.i2148 = fadd reassoc nnan nsz arcp contract afn float %.i2147, %.i2136 + %.i3149 = extractelement <4 x float> %i71, i64 3 + %.i3150 = fadd reassoc nnan nsz arcp contract afn float %.i3149, %.i3138 + %i72 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 224, i32 0), !invariant.load !0 + %i73 = shufflevector <2 x i32> %i72, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i74 = bitcast <4 x i32> %i73 to <4 x float> + %.i0151 = extractelement <4 x float> %i74, i64 0 + %.i1152 = extractelement <4 x float> %i74, i64 1 + %.i0153 = fadd reassoc nnan nsz arcp contract afn float %.i0151, %i19 + %.i1154 = fadd reassoc nnan nsz arcp contract afn float %.i1152, %i16 + %i75 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0153, float %.i1154, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0155 = extractelement <4 x float> %i75, i64 0 + %.i0156 = fadd reassoc nnan nsz arcp contract afn float %.i0155, %.i0144 + %.i1157 = extractelement <4 x float> %i75, i64 1 + %.i1158 = fadd reassoc nnan nsz arcp contract afn float %.i1157, %.i1146 + %.i2159 = extractelement <4 x float> %i75, i64 2 + %.i2160 = fadd reassoc nnan nsz arcp contract afn float %.i2159, %.i2148 + %.i3161 = extractelement <4 x float> %i75, i64 3 + %.i3162 = fadd reassoc nnan nsz arcp contract afn float %.i3161, %.i3150 + %i76 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 240, i32 0), !invariant.load !0 + %i77 = shufflevector <2 x i32> %i76, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i78 = bitcast <4 x i32> %i77 to <4 x float> + %.i0163 = extractelement <4 x float> %i78, i64 0 + %.i1164 = extractelement <4 x float> %i78, i64 1 + %.i0165 = fadd reassoc nnan nsz arcp contract afn float %.i0163, %i19 + %.i1166 = fadd reassoc nnan nsz arcp contract afn float %.i1164, %i16 + %i79 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0165, float %.i1166, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0167 = extractelement <4 x float> %i79, i64 0 + %.i0168 = fadd reassoc nnan nsz arcp contract afn float %.i0167, %.i0156 + %.i1169 = extractelement <4 x float> %i79, i64 1 + %.i1170 = fadd reassoc nnan nsz arcp contract afn float %.i1169, %.i1158 + %.i2171 = extractelement <4 x float> %i79, i64 2 + %.i2172 = fadd reassoc nnan nsz arcp contract afn float %.i2171, %.i2160 + %.i3173 = extractelement <4 x float> %i79, i64 3 + %.i3174 = fadd reassoc nnan nsz arcp contract afn float %.i3173, %.i3162 + %i80 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 256, i32 0), !invariant.load !0 + %i81 = shufflevector <2 x i32> %i80, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i82 = bitcast <4 x i32> %i81 to <4 x float> + %.i0175 = extractelement <4 x float> %i82, i64 0 + %.i1176 = extractelement <4 x float> %i82, i64 1 + %.i0177 = fadd reassoc nnan nsz arcp contract afn float %.i0175, %i19 + %.i1178 = fadd reassoc nnan nsz arcp contract afn float %.i1176, %i16 + %i83 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0177, float %.i1178, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0) + %.i0179 = extractelement <4 x float> %i83, i64 0 + %.i0180 = fadd reassoc nnan nsz arcp contract afn float %.i0179, %.i0168 + %.i1181 = extractelement <4 x float> %i83, i64 1 + %.i1182 = fadd reassoc nnan nsz arcp contract afn float %.i1181, %.i1170 + %.i2183 = extractelement <4 x float> %i83, i64 2 + %.i2184 = fadd reassoc nnan nsz arcp contract afn float %.i2183, %.i2172 + %.i3185 = extractelement <4 x float> %i83, i64 3 + %.i3186 = fadd reassoc nnan nsz arcp contract afn float %.i3185, %.i3174 + %i84 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i0180, float %.i1182) + %i85 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i2184, float %.i3186) + %i86 = bitcast <2 x half> %i84 to float + %i87 = bitcast <2 x half> %i85 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float %i86, float %i87, float poison, float poison, i1 true, i1 true) + ret void +} + +declare noundef i64 @llvm.amdgcn.s.getpc() #3 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #5 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3 +declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #4 +declare float @llvm.amdgcn.lds.param.load(i32 immarg, i32 immarg, i32) #3 +declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3 +declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3 +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8 + +attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause" "amdgpu-max-memory-cluster-dwords"="32"} +attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) } +attributes #8 = { nocallback nofree nosync nounwind willreturn memory(none) } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir index 6a25e346c894..49576433ab54 100644 --- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir @@ -1,121 +1,140 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s --- -# GCN-LABEL: name: vcmpx_vopc_write_exec_permlane16_swap_vop1 -# GCN: V_CMPX_EQ_I32_e32 -# GCN-NEXT: S_NOP 3 -# GCN-NEXT: V_PERMLANE name: vcmpx_vopc_write_exec_permlane16_swap_vop1 body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vcmpx_vopc_write_exec_permlane16_swap_vop1 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec + ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec ... --- -# GCN-LABEL: name: vcmpx_vop3_write_exec_permlane16_swap_vop1 -# GCN: V_CMPX_EQ_I32_e64 -# GCN-NEXT: S_NOP 3 -# GCN-NEXT: V_PERMLANE name: vcmpx_vop3_write_exec_permlane16_swap_vop1 body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vcmpx_vop3_write_exec_permlane16_swap_vop1 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = V_CMPX_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec + ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec $exec = V_CMPX_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec ... --- -# GCN-LABEL: name: vcmpx_vopc_write_exec_permlane16_swap_vop3 -# GCN: V_CMPX_EQ_I32_e32 -# GCN-NEXT: S_NOP 3 -# GCN-NEXT: V_PERMLANE name: vcmpx_vopc_write_exec_permlane16_swap_vop3 body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vcmpx_vopc_write_exec_permlane16_swap_vop3 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec + ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec ... --- -# GCN-LABEL: name: vcmpx_vop3_write_exec_permlane16_swap_vop3 -# GCN: V_CMPX_EQ_I32_e64 -# GCN-NEXT: S_NOP 3 -# GCN-NEXT: V_PERMLANE name: vcmpx_vop3_write_exec_permlane16_swap_vop3 body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vcmpx_vop3_write_exec_permlane16_swap_vop3 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = V_CMPX_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec + ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec $exec = V_CMPX_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec ... --- -# GCN-LABEL: name: vcmpx_vopc_write_exec_permlane32_swap_vop1 -# GCN: V_CMPX_EQ_I32_e32 -# GCN-NEXT: S_NOP 3 -# GCN-NEXT: V_PERMLANE name: vcmpx_vopc_write_exec_permlane32_swap_vop1 body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vcmpx_vopc_write_exec_permlane32_swap_vop1 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec + ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec ... --- -# GCN-LABEL: name: vcmpx_vop3_write_exec_permlane32_swap_vop1 -# GCN: V_CMPX_EQ_I32_e64 -# GCN-NEXT: S_NOP 3 -# GCN-NEXT: V_PERMLANE name: vcmpx_vop3_write_exec_permlane32_swap_vop1 body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vcmpx_vop3_write_exec_permlane32_swap_vop1 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = V_CMPX_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec + ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec $exec = V_CMPX_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec ... --- -# GCN-LABEL: name: vcmpx_vopc_write_exec_permlane32_swap_vop3 -# GCN: V_CMPX_EQ_I32_e32 -# GCN-NEXT: S_NOP 3 -# GCN-NEXT: V_PERMLANE name: vcmpx_vopc_write_exec_permlane32_swap_vop3 body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vcmpx_vopc_write_exec_permlane32_swap_vop3 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec + ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec ... --- -# GCN-LABEL: name: vcmpx_vop3_write_exec_permlane32_swap_vop3 -# GCN: V_CMPX_EQ_I32_e64 -# GCN-NEXT: S_NOP 3 -# GCN-NEXT: V_PERMLANE name: vcmpx_vop3_write_exec_permlane32_swap_vop3 body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vcmpx_vop3_write_exec_permlane32_swap_vop3 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = V_CMPX_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec + ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec $exec = V_CMPX_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec ... --- -# GCN-LABEL: name: vcmpx_vopc_write_exec_permlane16_swap_vop1__nowait -# GCN: V_CMPX_EQ_I32_e32 -# GCN-NEXT: V_MOV_B32 -# GCN-NEXT: V_MOV_B32 -# GCN-NEXT: V_MOV_B32 -# GCN-NEXT: V_MOV_B32 -# GCN-NEXT: V_PERMLANE name: vcmpx_vopc_write_exec_permlane16_swap_vop1__nowait body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vcmpx_vopc_write_exec_permlane16_swap_vop1__nowait + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec $vgpr2 = V_MOV_B32_e32 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec @@ -125,17 +144,19 @@ body: | ... --- -# GCN-LABEL: name: vcmpx_vopc_write_exec_permlane16_swap_vop1__wait1 -# GCN: V_CMPX_EQ_I32_e32 -# GCN-NEXT: V_MOV_B32 -# GCN-NEXT: V_MOV_B32 -# GCN-NEXT: V_MOV_B32 -# GCN-NEXT: S_NOP 0 -# GCN-NEXT: V_PERMLANE name: vcmpx_vopc_write_exec_permlane16_swap_vop1__wait1 body: | bb.0: liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: vcmpx_vopc_write_exec_permlane16_swap_vop1__wait1 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit-def $exec, implicit-def $vcc, implicit $exec $vgpr2 = V_MOV_B32_e32 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec @@ -144,112 +165,128 @@ body: | ... --- -# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0 -# GCN: V_MOV_B32 -# GCN-NEXT: S_NOP 1 -# GCN-NEXT: V_PERMLANE name: valu_write_vdst_read_permlane16_swap_0 body: | bb.0: liveins: $vgpr1 + ; GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0 + ; GCN: liveins: $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec $vgpr0 = V_MOV_B32_e32 0, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec ... --- -# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_1 -# GCN: V_MOV_B32 -# GCN-NEXT: S_NOP 1 -# GCN-NEXT: V_PERMLANE name: valu_write_vdst_read_permlane16_swap_1 body: | bb.0: liveins: $vgpr0 + ; GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_1 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec $vgpr1 = V_MOV_B32_e32 0, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec ... --- -# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_0 -# GCN: V_MOV_B32 -# GCN-NEXT: S_NOP 1 -# GCN-NEXT: V_PERMLANE name: valu_write_vdst_read_permlane32_swap_0 body: | bb.0: liveins: $vgpr1 + ; GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_0 + ; GCN: liveins: $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec $vgpr0 = V_MOV_B32_e32 0, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec ... --- -# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_1 -# GCN: V_MOV_B32 -# GCN-NEXT: S_NOP 1 -# GCN-NEXT: V_PERMLANE name: valu_write_vdst_read_permlane32_swap_1 body: | bb.0: liveins: $vgpr0 + ; GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_1 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec $vgpr1 = V_MOV_B32_e32 0, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec ... --- # No hazard, write of other register -# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0_otherreg -# GCN: V_MOV_B32 -# GCN-NEXT: V_PERMLANE name: valu_write_vdst_read_permlane16_swap_0_otherreg body: | bb.0: liveins: $vgpr1 + ; GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0_otherreg + ; GCN: liveins: $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec $vgpr2 = V_MOV_B32_e32 0, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec ... --- # Both permlane hazards at once. -# GCN-LABEL: name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap -# GCN: V_MOV_B32 -# GCN: V_CMPX_EQ_I32 -# GCN-NEXT: S_NOP 3 -# GCN-NEXT: V_PERMLANE name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap body: | bb.0: liveins: $vgpr0, $vgpr2, $vgpr3 + ; GCN-LABEL: name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap + ; GCN: liveins: $vgpr0, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec + ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec $vgpr1 = V_MOV_B32_e32 0, implicit $exec $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec ... --- -# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap -# GCN: V_CMPX_EQ_I32 -# GCN: V_MOV_B32 -# GCN-NEXT: S_NOP 2 -# GCN-NEXT: V_PERMLANE name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap body: | bb.0: liveins: $vgpr0, $vgpr2, $vgpr3 + ; GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap + ; GCN: liveins: $vgpr0, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_NOP 2 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec $vgpr1 = V_MOV_B32_e32 0, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec ... --- -# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap -# GCN: V_CMPX_EQ_I32 -# GCN: V_MOV_B32 -# GCN: V_MOV_B32 -# GCN-NEXT: S_NOP 1 -# GCN-NEXT: V_PERMLANE name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap body: | bb.0: liveins: $vgpr0, $vgpr2, $vgpr3 + ; GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap + ; GCN: liveins: $vgpr0, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec $vgpr1 = V_MOV_B32_e32 0, implicit $exec $vgpr0 = V_MOV_B32_e32 0, implicit $exec @@ -258,13 +295,18 @@ body: | --- -# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_bf16_hazard -# GCN: V_CVT_SCALEF32_SR_FP8_BF16_e64 -# GCN: GLOBAL_STORE_DWORD name: test_cvt_scalef32_sr_fp8_bf16_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_cvt_scalef32_sr_fp8_bf16_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec S_WAITCNT 3952 renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec @@ -273,13 +315,18 @@ body: | ... --- -# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_f16_hazard -# GCN: V_CVT_SCALEF32_SR_FP8_F16_e64 -# GCN: GLOBAL_STORE_DWORD name: test_cvt_scalef32_sr_fp8_f16_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_cvt_scalef32_sr_fp8_f16_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr5, 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec S_WAITCNT 3952 renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec @@ -288,14 +335,20 @@ body: | ... --- -# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_f32_hazard -# GCN: V_CVT_SCALEF32_SR_FP8_F32_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 name: test_cvt_scalef32_sr_fp8_f32_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_cvt_scalef32_sr_fp8_f32_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_F32_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr2 = V_ADD_U32_e32 4, killed $vgpr5, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec S_WAITCNT 3952 renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_F32_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec @@ -305,14 +358,20 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_hazard -# GCN: V_CVT_SCALEF32_PK_FP8_F32_e64 -# GCN: S_NOP 0 -# GCN: V_PK_ADD_U16 name: test_cvt_scalef32_pk_fp8_f32_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp8_f32_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_FP8_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec @@ -322,14 +381,18 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_hazard -# GCN: V_CVT_SCALEF32_PK_FP8_F16_e64 -# GCN: S_NOP 0 -# GCN: V_PK_ADD_U16 name: test_cvt_scalef32_pk_fp8_f16_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp8_f16_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_FP8_F16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr0 = V_CVT_SCALEF32_PK_FP8_F16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec @@ -337,14 +400,20 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_hazard -# GCN: V_CVT_SCALEF32_SR_BF8_BF16_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 name: test_cvt_scalef32_pk_fp8_bf16_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp8_bf16_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec S_WAITCNT 3952 @@ -354,14 +423,20 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_sr_bf8_f16_hazard -# GCN: V_CVT_SCALEF32_SR_BF8_F16_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 name: test_cvt_scalef32_sr_bf8_f16_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_cvt_scalef32_sr_bf8_f16_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec S_WAITCNT 3952 @@ -371,14 +446,20 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_sr_bf8_f32_hazard -# GCN: V_CVT_SCALEF32_SR_BF8_F32_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 name: test_cvt_scalef32_sr_bf8_f32_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_cvt_scalef32_sr_bf8_f32_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F32_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec S_WAITCNT 3952 @@ -388,14 +469,20 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_hazard -# GCN: V_CVT_SCALEF32_PK_BF8_F32_e64 -# GCN: S_NOP 0 -# GCN: V_PK_ADD_U16 name: test_cvt_scalef32_pk_bf8_f32_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: test_cvt_scalef32_pk_bf8_f32_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec @@ -405,14 +492,18 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_hazard -# GCN: V_CVT_SCALEF32_PK_BF8_F16_e64 -# GCN: S_NOP 0 -# GCN: V_PK_ADD_U16 name: test_cvt_scalef32_pk_bf8_f16_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_bf8_f16_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_F16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_F16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec @@ -420,14 +511,18 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_hazard -# GCN: V_CVT_SCALEF32_PK_BF8_BF16_e64 -# GCN: S_NOP 0 -# GCN: V_PK_ADD_U16 name: test_cvt_scalef32_pk_bf8_bf16_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_bf8_bf16_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_BF16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_BF16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec @@ -435,14 +530,42 @@ body: | ... --- -# GCN-LABEL: test_cvt_scale_fp4_f32_hazard -# GCN: V_CVT_SCALEF32_PK_FP4_F32_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 -name: test_cvt_scale_fp4_f32_hazard +name: test_cvt_scalef32_pk_fp4_f32_neg_hazard_opsel0 body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_f32_neg_hazard_opsel0 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_FP4_F32_e64 0, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + renamable $vgpr0 = V_CVT_SCALEF32_PK_FP4_F32_e64 0, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_cvt_scalef32_pk_fp4_f32_opsel3_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_f32_opsel3_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_FP4_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 4, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec @@ -452,14 +575,64 @@ body: | ... --- -# GCN-LABEL: test_scalef32_sr_pk_fp4_f16_hazard -# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 +name: test_cvt_scalef32_pk_fp4_f32_opsel0_neg_fp4_as_src_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_f32_opsel0_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_FP4_F32_e64 0, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr1 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + renamable $vgpr0 = V_CVT_SCALEF32_PK_FP4_F32_e64 0, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr1 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_cvt_scalef32_pk_fp4_f32_opsel3_neg_fp4_as_src_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_f32_opsel3_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_FP4_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 4, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr1 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + renamable $vgpr0 = V_CVT_SCALEF32_PK_FP4_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 4, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr1 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- name: test_scalef32_sr_pk_fp4_f16_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f16_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec S_WAITCNT 3952 @@ -469,31 +642,133 @@ body: | ... --- -# GCN-LABEL: test_scalef32_sr_pk_fp4_bf16_hazard -# GCN: V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 -name: test_scalef32_sr_pk_fp4_bf16_hazard +name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_scalef32_sr_pk_fp4_bf16_opsel3_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_opsel3_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_scalef32_sr_pk_fp4_bf16_opsel0_neg_fp4_as_src_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_opsel0_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_scalef32_sr_pk_fp4_bf16_opsel3_neg_fp4_as_src_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_opsel3_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec S_WAITCNT 3952 early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 ... --- -# GCN-LABEL: test_scalef32_sr_pk_fp4_f32_hazard -# GCN: V_CVT_SCALEF32_SR_PK_FP4_F32_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 -name: test_scalef32_sr_pk_fp4_f32_hazard +name: test_scalef32_sr_pk_fp4_f32_opsel3_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_opsel3_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 8, killed $vgpr2_vgpr3, 0, killed $vgpr4, 4, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec S_WAITCNT 3952 @@ -503,29 +778,153 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_fp4_f16_hazard -# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 -name: test_cvt_scalef32_fp4_f16_hazard +name: test_scalef32_sr_pk_fp4_f32_opsel0_neg_fp4_as_src_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_opsel0_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_scalef32_sr_pk_fp4_f32_opsel3_neg_fp4_as_src_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_opsel3_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 8, killed $vgpr2_vgpr3, 0, killed $vgpr4, 4, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 8, killed $vgpr2_vgpr3, 0, killed $vgpr4, 4, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_cvt_scalef32_pk_fp4_f16_neg_opsel0_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_f16_neg_opsel0_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr1, 0, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr1, 0, killed $vgpr2, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_cvt_scalef32_pk_fp4_f16_opsel3_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_f16_opsel3_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_cvt_scalef32_pk_fp4_f16_opsel0_neg_fp4_as_src_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_f16_opsel0_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr1, 0, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr1, 0, killed $vgpr2, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_cvt_scalef32_pk_fp4_f16_opsel3_neg_fp4_as_src_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_f16_opsel3_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_cvt_scalef32_pk_fp4_bf16_neg_opsel0_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_bf16_neg_opsel0_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_BF16_e64 0, killed $vgpr0, 0, killed $vgpr1, 0, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_BF16_e64 0, killed $vgpr0, 0, killed $vgpr1, 0, killed $vgpr2, 0, implicit $mode, implicit $exec renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 ... --- -# GCN-LABEL: test_cvt_scalef32_fp4_bf16_hazard -# GCN: V_CVT_SCALEF32_PK_FP4_BF16_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 -name: test_cvt_scalef32_fp4_bf16_hazard +name: test_cvt_scalef32_pk_fp4_bf16_opsel3_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_bf16_opsel3_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_BF16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_BF16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec @@ -533,14 +932,57 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_hazard_skipping_over_meta_instr -# GCN: V_CVT_SCALEF32_SR_BF8_F16_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 +name: test_cvt_scalef32_pk_fp4_bf16_opsel0_neg_fp4_as_src_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_bf16_opsel0_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_BF16_e64 0, killed $vgpr0, 0, killed $vgpr1, 0, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_BF16_e64 0, killed $vgpr0, 0, killed $vgpr1, 0, killed $vgpr2, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_cvt_scalef32_pk_fp4_bf16_opsel3_neg_fp4_as_src_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_fp4_bf16_opsel3_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_BF16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_BF16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- name: test_cvt_scalef32_hazard_skipping_over_meta_instr body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_cvt_scalef32_hazard_skipping_over_meta_instr + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = KILL + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec S_WAITCNT 3952 @@ -551,15 +993,18 @@ body: | ... --- -# GCN-LABEL: test_cvt_f16_to_fp4_to_f16_hazard -# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 -# GCN: S_NOP 0 -# GCN: V_CVT_SCALEF32_PK_F16_FP4_e64 -# GCN: S_SETPC_B64_return name: test_cvt_f16_to_fp4_to_f16_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_f16_to_fp4_to_f16_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_F16_FP4_e64 4, killed $vgpr2, 0, killed $vgpr1, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec renamable $vgpr0 = V_CVT_SCALEF32_PK_F16_FP4_e64 4, killed $vgpr2, 0, killed $vgpr1, 0, implicit $mode, implicit $exec @@ -567,14 +1012,58 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_hazard_pseudo -# GCN: V_CVT_SCALEF32_SR_BF8_F16_e64 -# GCN: S_NOP 0 -# GCN: V_ADD_U32_e32 +name: test_cvt_scalef32_pk_f16_fp4_opsel0_neg_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_f16_fp4_opsel0_neg_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_F16_FP4_e64 0, killed $vgpr2, 0, killed $vgpr1, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr0 = V_CVT_SCALEF32_PK_F16_FP4_e64 0, killed $vgpr2, 0, killed $vgpr1, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + + +--- +name: test_cvt_scalef32_pk_f16_fp4_opsel3_neg_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_pk_f16_fp4_opsel3_neg_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_PK_F16_FP4_e64 4, killed $vgpr2, 4, killed $vgpr1, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr0 = V_CVT_SCALEF32_PK_F16_FP4_e64 4, killed $vgpr2, 4, killed $vgpr1, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- name: test_cvt_scalef32_hazard_pseudo body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-LABEL: name: test_cvt_scalef32_hazard_pseudo + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: WAVE_BARRIER + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec S_WAITCNT 3952 @@ -585,13 +1074,22 @@ body: | ... --- -# GCN-LABEL: test_call_consuming_cvt_scalef32_hazard -# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 -# GCN: SI_CALL name: test_call_consuming_cvt_scalef32_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_call_consuming_cvt_scalef32_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1, implicit-def $sgpr0, implicit-def $sgpr0_lo16, implicit-def $sgpr0_hi16, implicit-def $sgpr1, implicit-def $sgpr1_lo16, implicit-def $sgpr1_hi16, implicit-def $scc { + ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 + ; GCN-NEXT: $sgpr0 = S_ADD_U32 internal $sgpr0, target-flags(amdgpu-gotprel32-lo) @test_cvt_scalef32_hazard_pseudo + 4, implicit-def $scc + ; GCN-NEXT: $sgpr1 = S_ADDC_U32 internal $sgpr1, target-flags(amdgpu-gotprel32-hi) @test_cvt_scalef32_hazard_pseudo + 12, implicit-def $scc, implicit internal $scc + ; GCN-NEXT: } + ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr0_sgpr1, @test_cvt_scalef32_hazard_pseudo, csr_amdgpu_gfx90ainsts, implicit undef $sgpr4_sgpr5, implicit undef $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit undef $sgpr10_sgpr11, implicit undef $sgpr12, implicit undef $sgpr13, implicit undef $sgpr14, implicit-def $sgpr15, implicit undef $vgpr31, implicit killed $vgpr2, implicit-def $vgpr2 + ; GCN-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 BUNDLE implicit-def $sgpr0_sgpr1, implicit-def $sgpr0, implicit-def $sgpr0_lo16, implicit-def $sgpr0_hi16, implicit-def $sgpr1, implicit-def $sgpr1_lo16, implicit-def $sgpr1_hi16, implicit-def $scc { $sgpr0_sgpr1 = S_GETPC_B64 $sgpr0 = S_ADD_U32 internal $sgpr0, target-flags(amdgpu-gotprel32-lo) @test_cvt_scalef32_hazard_pseudo + 4, implicit-def $scc @@ -604,14 +1102,18 @@ body: | ... --- -# GCN-LABEL: test_cvt_scalef32_inlineasm_hazard -# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 -# GCN: S_NOP 0 -# GCN: INLINEASM name: test_cvt_scalef32_inlineasm_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-LABEL: name: test_cvt_scalef32_inlineasm_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, killed renamable $vgpr2 + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31 S_WAITCNT 0 renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, killed renamable $vgpr2 @@ -619,16 +1121,42 @@ body: | ... --- -# GCN-LABEL: test_cvt_scale_cvt_scale_hazard -# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 -# GCN: S_NOP 0 -# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64 -# GCN: S_NOP 0 -# GCN: S_SETPC_B64_return -name: test_cvt_scale_cvt_scale_hazard +name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr3, 0, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr3, 0, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel3_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel3_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec @@ -637,15 +1165,59 @@ body: | ... --- -# GCN-LABEL: test_cvt_scale_cvt_scale_waw_hazard -# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 -# GCN: S_NOP 0 -# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64 -# GCN: S_SETPC_B64_return +name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_neg_fp4_as_src_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 0, $vgpr0, 0, $vgpr1, 0, killed $vgpr2, 0, implicit $mode, implicit $exec + early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel3_neg_fp4_as_src_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel3_neg_fp4_as_src_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- name: test_cvt_scale_cvt_scale_waw_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: test_cvt_scale_cvt_scale_waw_hazard + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: early-clobber renamable $vgpr2 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr1, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec early-clobber renamable $vgpr2 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr1, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/illegal-eviction-assert.mir b/llvm/test/CodeGen/AMDGPU/illegal-eviction-assert.mir index 40089ed82b5d..99c27fa0bc95 100644 --- a/llvm/test/CodeGen/AMDGPU/illegal-eviction-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/illegal-eviction-assert.mir @@ -6,7 +6,7 @@ # check was inconsistent with a later assertion when the eviction was # performed. -# ERR: error: ran out of registers during register allocation +# ERR: error: <unknown>:0:0: ran out of registers during register allocation --- | define void @foo() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll index e37b6ff10ffa..3563e737f552 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @get_local_size_x(ptr addrspace(1) %out) #0 { ; GCN-LABEL: @get_local_size_x( ; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12 +; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12 ; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 4 ; GCN-NEXT: store i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void @@ -25,7 +25,7 @@ define amdgpu_kernel void @get_local_size_x(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @get_local_size_y(ptr addrspace(1) %out) #0 { ; GCN-LABEL: @get_local_size_y( ; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14 +; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14 ; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 2 ; GCN-NEXT: store i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void @@ -46,7 +46,7 @@ define amdgpu_kernel void @get_local_size_y(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @get_local_size_z(ptr addrspace(1) %out) #0 { ; GCN-LABEL: @get_local_size_z( ; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16 +; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16 ; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 4 ; GCN-NEXT: store i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void @@ -106,7 +106,7 @@ define amdgpu_kernel void @get_remainder_z(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @get_work_group_size_x(ptr addrspace(1) %out) #0 { ; GCN-LABEL: @get_work_group_size_x( ; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12 +; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12 ; GCN-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 4 ; GCN-NEXT: store i16 [[GROUP_SIZE_X]], ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void @@ -122,7 +122,7 @@ define amdgpu_kernel void @get_work_group_size_x(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @get_work_group_size_y(ptr addrspace(1) %out) #0 { ; GCN-LABEL: @get_work_group_size_y( ; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14 +; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14 ; GCN-NEXT: [[GROUP_SIZE_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 ; GCN-NEXT: store i16 [[GROUP_SIZE_Y]], ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void @@ -138,7 +138,7 @@ define amdgpu_kernel void @get_work_group_size_y(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @get_work_group_size_z(ptr addrspace(1) %out) #0 { ; GCN-LABEL: @get_work_group_size_z( ; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16 +; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16 ; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 4 ; GCN-NEXT: store i16 [[GROUP_SIZE_Z]], ptr addrspace(1) [[OUT:%.*]], align 2 ; GCN-NEXT: ret void diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index 7f334e0ca21e..3d27b5fe7f30 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -101,6 +101,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-NEXT: flat_store_dword v[2:3], v0 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: s_endpgm + %flat.private = addrspacecast ptr addrspace(5) %ptr.private to ptr %flat.local = addrspacecast ptr addrspace(3) %ptr.local to ptr store volatile i32 1, ptr %flat.private diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll index 4c21f8729745..d5f45d70fb97 100644 --- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll @@ -70,14 +70,32 @@ define void @use_everything_else() { } define amdgpu_kernel void @test_default_queue_offset_v4_0(ptr addrspace(1) %kernarg) { -; CHECK-LABEL: define {{[^@]+}}@test_default_queue_offset_v4_0 -; CHECK-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: call void @use_everything_else() -; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 32 -; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 -; CHECK-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 -; CHECK-NEXT: ret void +; V4-LABEL: define {{[^@]+}}@test_default_queue_offset_v4_0 +; V4-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR2:[0-9]+]] { +; V4-NEXT: call void @use_everything_else() +; V4-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; V4-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 32 +; V4-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 +; V4-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 +; V4-NEXT: ret void +; +; V5-LABEL: define {{[^@]+}}@test_default_queue_offset_v4_0 +; V5-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR1]] { +; V5-NEXT: call void @use_everything_else() +; V5-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; V5-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 32 +; V5-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 +; V5-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 +; V5-NEXT: ret void +; +; V6-LABEL: define {{[^@]+}}@test_default_queue_offset_v4_0 +; V6-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR1]] { +; V6-NEXT: call void @use_everything_else() +; V6-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; V6-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 32 +; V6-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 +; V6-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 +; V6-NEXT: ret void ; call void @use_everything_else() %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -88,14 +106,32 @@ define amdgpu_kernel void @test_default_queue_offset_v4_0(ptr addrspace(1) %kern } define amdgpu_kernel void @test_default_queue_offset_v5_0(ptr addrspace(1) %kernarg) { -; CHECK-LABEL: define {{[^@]+}}@test_default_queue_offset_v5_0 -; CHECK-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR3:[0-9]+]] { -; CHECK-NEXT: call void @use_everything_else() -; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 104 -; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 -; CHECK-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 -; CHECK-NEXT: ret void +; V4-LABEL: define {{[^@]+}}@test_default_queue_offset_v5_0 +; V4-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR3:[0-9]+]] { +; V4-NEXT: call void @use_everything_else() +; V4-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; V4-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 104 +; V4-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 +; V4-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 +; V4-NEXT: ret void +; +; V5-LABEL: define {{[^@]+}}@test_default_queue_offset_v5_0 +; V5-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR2:[0-9]+]] { +; V5-NEXT: call void @use_everything_else() +; V5-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; V5-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 104 +; V5-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 +; V5-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 +; V5-NEXT: ret void +; +; V6-LABEL: define {{[^@]+}}@test_default_queue_offset_v5_0 +; V6-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR2:[0-9]+]] { +; V6-NEXT: call void @use_everything_else() +; V6-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; V6-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 104 +; V6-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 +; V6-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 +; V6-NEXT: ret void ; call void @use_everything_else() %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -116,7 +152,7 @@ define amdgpu_kernel void @test_completion_action_offset_v4_0(ptr addrspace(1) % ; V4-NEXT: ret void ; ; V5-LABEL: define {{[^@]+}}@test_completion_action_offset_v4_0 -; V5-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR2]] { +; V5-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR1]] { ; V5-NEXT: call void @use_everything_else() ; V5-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; V5-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 40 @@ -125,7 +161,7 @@ define amdgpu_kernel void @test_completion_action_offset_v4_0(ptr addrspace(1) % ; V5-NEXT: ret void ; ; V6-LABEL: define {{[^@]+}}@test_completion_action_offset_v4_0 -; V6-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR2]] { +; V6-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR1]] { ; V6-NEXT: call void @use_everything_else() ; V6-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; V6-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 40 @@ -142,32 +178,14 @@ define amdgpu_kernel void @test_completion_action_offset_v4_0(ptr addrspace(1) % } define amdgpu_kernel void @test_completion_action_offset_v5_0(ptr addrspace(1) %kernarg) { -; V4-LABEL: define {{[^@]+}}@test_completion_action_offset_v5_0 -; V4-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR3]] { -; V4-NEXT: call void @use_everything_else() -; V4-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; V4-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 112 -; V4-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 -; V4-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 -; V4-NEXT: ret void -; -; V5-LABEL: define {{[^@]+}}@test_completion_action_offset_v5_0 -; V5-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR4:[0-9]+]] { -; V5-NEXT: call void @use_everything_else() -; V5-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; V5-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 112 -; V5-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 -; V5-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 -; V5-NEXT: ret void -; -; V6-LABEL: define {{[^@]+}}@test_completion_action_offset_v5_0 -; V6-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR4:[0-9]+]] { -; V6-NEXT: call void @use_everything_else() -; V6-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; V6-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 112 -; V6-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 -; V6-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 -; V6-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@test_completion_action_offset_v5_0 +; CHECK-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: call void @use_everything_else() +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 112 +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[GEP]], align 8 +; CHECK-NEXT: store ptr [[LOAD]], ptr addrspace(1) [[KERNARG]], align 8 +; CHECK-NEXT: ret void ; call void @use_everything_else() %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -188,7 +206,7 @@ define amdgpu_kernel void @test_default_queue_completion_action_offset_v3_0(ptr ; V4-NEXT: ret void ; ; V5-LABEL: define {{[^@]+}}@test_default_queue_completion_action_offset_v3_0 -; V5-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR2]] { +; V5-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR1]] { ; V5-NEXT: call void @use_everything_else() ; V5-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; V5-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 32 @@ -197,7 +215,7 @@ define amdgpu_kernel void @test_default_queue_completion_action_offset_v3_0(ptr ; V5-NEXT: ret void ; ; V6-LABEL: define {{[^@]+}}@test_default_queue_completion_action_offset_v3_0 -; V6-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR2]] { +; V6-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR1]] { ; V6-NEXT: call void @use_everything_else() ; V6-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; V6-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 32 @@ -224,7 +242,7 @@ define amdgpu_kernel void @test_default_queue_completion_action_offset_v5_0(ptr ; V4-NEXT: ret void ; ; V5-LABEL: define {{[^@]+}}@test_default_queue_completion_action_offset_v5_0 -; V5-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR5:[0-9]+]] { +; V5-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR4:[0-9]+]] { ; V5-NEXT: call void @use_everything_else() ; V5-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; V5-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 104 @@ -233,7 +251,7 @@ define amdgpu_kernel void @test_default_queue_completion_action_offset_v5_0(ptr ; V5-NEXT: ret void ; ; V6-LABEL: define {{[^@]+}}@test_default_queue_completion_action_offset_v5_0 -; V6-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR5:[0-9]+]] { +; V6-SAME: (ptr addrspace(1) [[KERNARG:%.*]]) #[[ATTR4:[0-9]+]] { ; V6-NEXT: call void @use_everything_else() ; V6-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; V6-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 104 @@ -258,25 +276,23 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo ;. ; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll index f419d89a7f0a..a27cf3c18a70 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll @@ -68,6 +68,6 @@ if.end: ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll index e7a7b8a335d0..22e3cc4b047b 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll @@ -6,14 +6,14 @@ ; GCN: define amdgpu_kernel void @caller(ptr addrspace(1) nocapture %p) local_unnamed_addr #1 { ; GCN: %mul.i = fmul float %load, 1.500000e+01 -; UNSAFE: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; UNSAFE: attributes #0 = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" } ; UNSAFE: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; NOINFS: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "no-infs-fp-math"="true" "uniform-work-group-size"="false" } -; NOINFS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="false" } +; NOINFS: attributes #0 = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NOINFS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; NONANS: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "no-nans-fp-math"="true" "uniform-work-group-size"="false" } -; NONANS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="false" } +; NONANS: attributes #0 = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; NONANS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } declare void @extern() #0 @@ -32,5 +32,5 @@ entry: ret void } -attributes #0 = { nounwind } +attributes #0 = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true"} attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 9f093cc7b5ab..26a4ea9d8a4b 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -230,49 +230,27 @@ entry: } define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { -; GFX67-SDAG-LABEL: clpeak_imad_pat_i16: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v0, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: clpeak_imad_pat_i16: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: clpeak_imad_pat_i16: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v3, v0, v2 +; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v1, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i16: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -337,11 +315,11 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -363,13 +341,13 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -400,13 +378,13 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -470,42 +448,40 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v5, v3, 1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v4, v2, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i16: @@ -682,46 +658,43 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v6, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v7, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v6, v3, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v8, v8, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v7, v4, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v8, v5, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v0, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v6, v6, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v1, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v7, v7, v4, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v3, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v9 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v2, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v3, v2 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v3i16: @@ -1063,19 +1036,15 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v10, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v10, v5, v1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v8, v8, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v9, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v11, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v8, v4, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v9, v6, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v11, v7, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1085,60 +1054,60 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v10 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v13, v2, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v10, v10, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v12, v0, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v11 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX67-GISEL-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX67-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v15, v3, v7 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v11, v11, v7, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v3, v7, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v14, v1, v6 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v9, v9, v6, 1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v6, 1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v12 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; GFX67-GISEL-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v13 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v5, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v6, v3 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v4, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v8 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v1, v9 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v2, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v4, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v7 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v4i16: @@ -1403,47 +1372,26 @@ entry: } define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { -; GFX67-SDAG-LABEL: clpeak_umad_pat_i16: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v1 -; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v3, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: clpeak_umad_pat_i16: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: clpeak_umad_pat_i16: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_mul_u32_u24_e32 v2, v0, v1 +; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v3, v2 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_mad_u32_u24 v1, v3, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_i16: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -1504,11 +1452,11 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1530,13 +1478,13 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1567,13 +1515,13 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1637,42 +1585,40 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v5, v3, 1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v4, v2, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v2i16: @@ -1849,46 +1795,43 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v6, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v7, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v6, v3, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v8, v8, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v7, v4, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v8, v5, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v0, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v6, v6, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v1, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v7, v7, v4, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v3, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v9 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v2, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v3, v2 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v3i16: @@ -2230,19 +2173,15 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v10, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v10, v5, v1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v8, v8, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v9, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v11, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v8, v4, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v9, v6, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v11, v7, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2252,60 +2191,60 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v10 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v13, v2, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v10, v10, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v12, v0, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v11 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX67-GISEL-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX67-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v15, v3, v7 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v11, v11, v7, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v3, v7, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v14, v1, v6 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v9, v9, v6, 1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v6, 1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v12 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; GFX67-GISEL-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v13 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v5, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v6, v3 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v4, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v8 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v1, v9 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v2, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v4, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v7 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v4i16: @@ -4282,49 +4221,27 @@ entry: } define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) { -; GFX67-SDAG-LABEL: clpeak_imad_pat_i8: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v0, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: clpeak_imad_pat_i8: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: clpeak_imad_pat_i8: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v3, v0, v2 +; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v1, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i8: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -4389,11 +4306,11 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -4415,13 +4332,13 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 @@ -4452,13 +4369,13 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) { ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 @@ -4524,32 +4441,30 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v4, v2, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v5, v3, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v2, v1 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i8: @@ -4655,20 +4570,18 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v4, v0, v2 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v5, v1, v3 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v4, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v5, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v4, 1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v3, v5, 1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v4, v0, 1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v5, v1, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0 +; GFX10-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v4, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v4 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i8: @@ -4704,25 +4617,21 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v4, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v4, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v4, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v3, v5, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0 +; GFX11-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_add_nc_u16 v4, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v5, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3 +; GFX11-GISEL-NEXT: v_mad_u16 v2, v4, v2, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v4 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-LABEL: clpeak_imad_pat_v2i8: @@ -4766,25 +4675,21 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v4, v0, v2 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v5, v1, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v4, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v5, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v4, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v3, v5, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0 +; GFX1200-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-GISEL-NEXT: v_add_nc_u16 v4, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v5, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3 +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v4, v2, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v4 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v5 ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i8> %x, <i8 1, i8 1> @@ -7600,81 +7505,43 @@ entry: } define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { -; GFX67-SDAG-LABEL: clpeak_imad_pat_i16_x2: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v0, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: clpeak_imad_pat_i16_x2: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: clpeak_imad_pat_i16_x2: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v3, v0, v2 +; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v1, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v0, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v1, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v0, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v1, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i16_x2: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -7767,19 +7634,19 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -7807,23 +7674,23 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -7860,23 +7727,23 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -7902,79 +7769,42 @@ entry: } define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { -; GFX67-SDAG-LABEL: clpeak_umad_pat_i16_x2: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v1 -; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v3, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v3, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v2, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: clpeak_umad_pat_i16_x2: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: clpeak_umad_pat_i16_x2: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_mul_u32_u24_e32 v2, v0, v1 +; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v3, v2 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v0, v1 +; GFX67-NEXT: v_mad_u32_u24 v2, v3, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v2, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v0, v1 +; GFX67-NEXT: v_mad_u32_u24 v2, v2, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v2, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_mad_u32_u24 v1, v2, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_i16_x2: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -8063,19 +7893,19 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -8103,23 +7933,23 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -8156,23 +7986,23 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -8268,10 +8098,8 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -8279,9 +8107,9 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v5, v3, 1 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v2, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -8290,64 +8118,60 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v1, v5, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v0, v4, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v3, v5, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v2, v4, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v3, v5, 1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v2, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: @@ -8591,10 +8415,8 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -8602,9 +8424,9 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v5, v3, 1 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v2, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -8613,64 +8435,60 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v1, v5, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v0, v4, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v3, v5, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v2, v4, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v3, v5, 1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v2, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: @@ -8908,24 +8726,14 @@ entry: } define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { -; GFX67-SDAG-LABEL: multi_use_mul_mad_i16_var: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v1, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v1, v3 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: multi_use_mul_mad_i16_var: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v0, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: multi_use_mul_mad_i16_var: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mad_u32_u24 v0, v4, v1, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v4, v1, v3 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: multi_use_mul_mad_i16_var: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -8973,10 +8781,9 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { ; GFX10-GISEL-LABEL: multi_use_mul_mad_i16_var: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v0, v2 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, v3 -; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -8992,12 +8799,10 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { ; GFX11-GISEL-LABEL: multi_use_mul_mad_i16_var: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v0, v2 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, v3 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -9021,12 +8826,10 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { ; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v0, v2 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, v3 -; GFX1200-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX1200-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] entry: @@ -9108,29 +8911,17 @@ entry: } define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) %ptr) { -; GFX67-SDAG-LABEL: other_use_mul_mad_i16_var: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX67-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX67-SDAG-NEXT: ds_write_b16 v3, v4 -; GFX67-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: other_use_mul_mad_i16_var: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v0, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v2 -; GFX67-GISEL-NEXT: s_mov_b32 m0, -1 -; GFX67-GISEL-NEXT: ds_write_b16 v3, v1 -; GFX67-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: other_use_mul_mad_i16_var: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v0, v1 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, v2 +; GFX67-NEXT: s_mov_b32 m0, -1 +; GFX67-NEXT: ds_write_b16 v3, v4 +; GFX67-NEXT: s_waitcnt lgkmcnt(0) +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: other_use_mul_mad_i16_var: ; GFX8: ; %bb.0: ; %entry @@ -9151,69 +8942,36 @@ define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) % ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: other_use_mul_mad_i16_var: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX10-SDAG-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX10-SDAG-NEXT: ds_write_b16 v3, v4 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: other_use_mul_mad_i16_var: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v1, v2 -; GFX10-GISEL-NEXT: ds_write_b16 v3, v1 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: other_use_mul_mad_i16_var: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX11-SDAG-NEXT: ds_store_b16 v3, v4 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: other_use_mul_mad_i16_var: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v1, v2 -; GFX11-GISEL-NEXT: ds_store_b16 v3, v1 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: other_use_mul_mad_i16_var: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX10-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX10-NEXT: ds_write_b16 v3, v4 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1200-SDAG-LABEL: other_use_mul_mad_i16_var: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX1200-SDAG-NEXT: ds_store_b16 v3, v4 -; GFX1200-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: other_use_mul_mad_i16_var: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX11-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX11-NEXT: ds_store_b16 v3, v4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX1200-GISEL-LABEL: other_use_mul_mad_i16_var: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v1, v2 -; GFX1200-GISEL-NEXT: ds_store_b16 v3, v1 -; GFX1200-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1200-LABEL: other_use_mul_mad_i16_var: +; GFX1200: ; %bb.0: ; %entry +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX1200-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX1200-NEXT: ds_store_b16 v3, v4 +; GFX1200-NEXT: s_wait_dscnt 0x0 +; GFX1200-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul i16 %x, %y %add0 = add i16 %mul, %z @@ -9246,16 +9004,14 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX67-GISEL-LABEL: multi_use_mul_mad_v2i16_var: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v0, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v0, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v8, v2, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v9, v3, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v8, v2, v6 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v9, v3, v7 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: multi_use_mul_mad_v2i16_var: @@ -9366,20 +9122,20 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX67-GISEL-LABEL: other_use_mul_mad_v2i16_var: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v8, v1, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX67-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, v5 ; GFX67-GISEL-NEXT: s_mov_b32 m0, -1 -; GFX67-GISEL-NEXT: ds_write_b32 v6, v2 +; GFX67-GISEL-NEXT: ds_write_b32 v6, v7 ; GFX67-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -9532,29 +9288,15 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) { ; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1200-SDAG-LABEL: mul_u24_add64: -; GFX1200-SDAG: ; %bb.0: -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3] -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: mul_u24_add64: -; GFX1200-GISEL: ; %bb.0: -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v1 -; GFX1200-GISEL-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2 -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1200-LABEL: mul_u24_add64: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3] +; GFX1200-NEXT: s_setpc_b64 s[30:31] %mul = call i64 @llvm.amdgcn.mul.u24.i64(i32 %x, i32 %y) %add = add i64 %mul, %z ret i64 %add diff --git a/llvm/test/CodeGen/AMDGPU/invalid-hidden-kernarg-in-kernel-signature.ll b/llvm/test/CodeGen/AMDGPU/invalid-hidden-kernarg-in-kernel-signature.ll new file mode 100644 index 000000000000..2e7b2b72088a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/invalid-hidden-kernarg-in-kernel-signature.ll @@ -0,0 +1,21 @@ +; RUN: not llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR,GISEL %s +; RUN: not llc -global-isel=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=ERROR %s +; RUN: not llc -global-isel=1 -global-isel-abort=2 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefixes=ERROR,GISEL %s +; RUN: not llc -global-isel=0 -amdgpu-ir-lower-kernel-arguments=0 -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +define amdgpu_kernel void @no_free_sgprs_block_count_x_no_preload_diag(ptr addrspace(1) inreg %out, i512 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 { +; GISEL: warning: Instruction selection used fallback path for no_free_sgprs_block_count_x_no_preload_diag +; ERROR: error: <unknown>:0:0: in function no_free_sgprs_block_count_x_no_preload_diag void (ptr addrspace(1), i512, i32): hidden argument in kernel signature was not preloaded + store i32 %_hidden_block_count_x, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preloadremainder_z_no_preload_diag(ptr addrspace(1) inreg %out, i256 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 { +; GISEL: warning: Instruction selection used fallback path for preloadremainder_z_no_preload_diag +; ERROR: error: <unknown>:0:0: in function preloadremainder_z_no_preload_diag void (ptr addrspace(1), i256, i32, i32, i32, i16, i16, i16, i16, i16, i16): hidden argument in kernel signature was not preloaded + %conv = zext i16 %_hidden_remainder_z to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/invalid-inline-asm-constraint-crash.ll b/llvm/test/CodeGen/AMDGPU/invalid-inline-asm-constraint-crash.ll index ec58686630bf..542e8d2db07a 100644 --- a/llvm/test/CodeGen/AMDGPU/invalid-inline-asm-constraint-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/invalid-inline-asm-constraint-crash.ll @@ -1,4 +1,4 @@ -; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: not llc -mtriple=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERR %s ; ERR: error: couldn't allocate output register for constraint 'q' define void @crash_use_invalid_output_constraint_block(ptr addrspace(1) %arg) { diff --git a/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll new file mode 100644 index 000000000000..e8b23f3bf3a7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=amdgpu-attributor %s | FileCheck %s + +@buf_shared = internal addrspace(3) global [2080 x i8] poison, align 16 + +; Constant expression element may not have a pointer type and the +; addrspacecast may not be the toplevel operation. + + +; This should infer "amdgpu-no-flat-scratch-init". It should not infer "amdgpu-no-queue-ptr" +;. +; CHECK: @buf_shared = internal addrspace(3) global [2080 x i8] poison, align 16 +; CHECK: @buf_private = internal addrspace(5) global [2080 x i8] poison, align 16 +;. +define amdgpu_kernel void @issue120256(ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_kernel void @issue120256( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[CONV_I:%.*]] = and i32 trunc (i64 sub (i64 16, i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @buf_shared to ptr) to i64)) to i32), 15 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) @buf_shared, i32 [[CONV_I]] +; CHECK-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) [[ADD_PTR]], align 1 +; CHECK-NEXT: store i8 [[LD]], ptr addrspace(1) [[OUT]], align 1 +; CHECK-NEXT: ret void +; + %conv.i = and i32 trunc (i64 sub (i64 16, i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @buf_shared to ptr) to i64)) to i32), 15 + %add.ptr = getelementptr inbounds nuw i8, ptr addrspace(3) @buf_shared, i32 %conv.i + %ld = load i8, ptr addrspace(3) %add.ptr, align 1 + store i8 %ld, ptr addrspace(1) %out, align 1 + ret void +} + +@buf_private = internal addrspace(5) global [2080 x i8] poison, align 16 + +; Constant expression element may not have a pointer type and the +; addrspacecast may not be the toplevel operation. + +; This should not infer "amdgpu-no-flat-scratch-init" nor "amdgpu-no-queue-ptr" +define amdgpu_kernel void @issue120256_private(ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_kernel void @issue120256_private( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[CONV_I:%.*]] = and i32 trunc (i64 sub (i64 16, i64 ptrtoint (ptr addrspacecast (ptr addrspace(5) @buf_private to ptr) to i64)) to i32), 15 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) @buf_private, i32 [[CONV_I]] +; CHECK-NEXT: [[LD:%.*]] = load i8, ptr addrspace(5) [[ADD_PTR]], align 1 +; CHECK-NEXT: store i8 [[LD]], ptr addrspace(1) [[OUT]], align 1 +; CHECK-NEXT: ret void +; + %conv.i = and i32 trunc (i64 sub (i64 16, i64 ptrtoint (ptr addrspacecast (ptr addrspace(5) @buf_private to ptr) to i64)) to i32), 15 + %add.ptr = getelementptr inbounds nuw i8, ptr addrspace(5) @buf_private, i32 %conv.i + %ld = load i8, ptr addrspace(5) %add.ptr, align 1 + store i8 %ld, ptr addrspace(1) %out, align 1 + ret void +} + +!llvm.module.flags = !{!0} + +; FIXME: Inference of amdgpu-no-queue-ptr should not depend on code object version. +!0 = !{i32 1, !"amdhsa_code_object_version", i32 400} +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx803" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx803" "uniform-work-group-size"="false" } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400} +;. diff --git a/llvm/test/CodeGen/AMDGPU/issue48473.mir b/llvm/test/CodeGen/AMDGPU/issue48473.mir index 5c202d9928ab..e272bd348038 100644 --- a/llvm/test/CodeGen/AMDGPU/issue48473.mir +++ b/llvm/test/CodeGen/AMDGPU/issue48473.mir @@ -2,7 +2,7 @@ # RUN: FileCheck -check-prefix=ERR %s < %t.err # ERR: error: register allocation failed: maximum depth for recoloring reached. Use -fexhaustive-register-search to skip cutoffs -# ERR-NEXT: error: ran out of registers during register allocation +# ERR-NEXT: error: <unknown>:0:0: ran out of registers during register allocation # This testcase used to fail with an "overlapping insert" assertion # when trying to roll back an unsucessful recoloring of %25. One of diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll index 397502711283..e70dc8f7a657 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -100,59 +100,29 @@ define void @func_use_lds_global() { ; ERR: warning: <unknown>:0:0: in function func_use_lds_global_constexpr_cast void (ptr addrspace(1)): local memory global used by non-kernel function define void @func_use_lds_global_constexpr_cast(ptr addrspace(1) %out) { -; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast: -; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 -; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-SDAG-NEXT: s_trap 2 -; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-GISEL-LABEL: func_use_lds_global_constexpr_cast: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], 0xc8 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-GISEL-NEXT: s_trap 2 -; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v0 -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: func_use_lds_global_constexpr_cast: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_trap 2 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: func_use_lds_global_constexpr_cast: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_trap 2 +; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: func_use_lds_global_constexpr_cast: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_trap 2 -; GFX9-GISEL-NEXT: global_store_dword v[0:1], v0, off -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: func_use_lds_global_constexpr_cast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_trap 2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-LABEL: func_use_lds_global_constexpr_cast: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_cbranch_execnz .LBB1_2 -; SDAG-NEXT: ; %bb.1: -; SDAG-NEXT: s_setpc_b64 s[30:31] -; SDAG-NEXT: .LBB1_2: -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: func_use_lds_global_constexpr_cast: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_cbranch_execnz .LBB1_2 -; GISEL-NEXT: ; %bb.1: -; GISEL-NEXT: global_store_dword v[0:1], v0, off -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_setpc_b64 s[30:31] -; GISEL-NEXT: .LBB1_2: -; GISEL-NEXT: s_endpgm +; CHECK-LABEL: func_use_lds_global_constexpr_cast: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_cbranch_execnz .LBB1_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: s_endpgm store i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) %out, align 4 ret void } @@ -611,7 +581,3 @@ ret: ret i32 %phi } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} -; GFX8: {{.*}} -; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll index b6232cbc3849..2dade8412b84 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950-SDAG %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950-GISEL %s declare i32 @llvm.amdgcn.bitop3.i32(i32, i32, i32, i32) declare i16 @llvm.amdgcn.bitop3.i16(i16, i16, i16, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll index 8481a3c2ccdb..f694d55f83b6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3)) declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3)) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index c3562708b15d..9e6a85dd2810 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -1146,6 +1146,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm + attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} !0 = !{i64 2862105} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll new file mode 100644 index 000000000000..80f295b93970 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefix=GCN %s + +define amdgpu_kernel void @MFMAExpInterleave(ptr addrspace(1) %out0, ptr addrspace(1) %out1, float %in0, <4 x float> %in1) { +; GCN-LABEL: MFMAExpInterleave: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s6, s[4:5], 0x10 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GCN-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b +; GCN-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-NEXT: s_mov_b32 s7, 0x42b17218 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, s6, v1 +; GCN-NEXT: v_rndne_f32_e32 v3, v2 +; GCN-NEXT: v_sub_f32_e32 v4, v2, v3 +; GCN-NEXT: v_fma_f32 v1, s6, v1, -v2 +; GCN-NEXT: v_mov_b32_e32 v2, 0x32a5705f +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_fmac_f32_e32 v1, s6, v2 +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_add_f32_e32 v1, v4, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] +; GCN-NEXT: v_exp_f32_e32 v1, v1 +; GCN-NEXT: s_mov_b32 s0, 0x3fb8aa3b +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] +; GCN-NEXT: ; iglp_opt mask(0x00000003) +; GCN-NEXT: v_ldexp_f32 v1, v1, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 0x42b17218 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GCN-NEXT: s_mov_b32 s6, 0xc2ce8ed0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 +; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 +; GCN-NEXT: v_rndne_f32_e32 v5, v3 +; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 +; GCN-NEXT: v_sub_f32_e32 v3, v3, v5 +; GCN-NEXT: v_add_f32_e32 v3, v3, v4 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] +; GCN-NEXT: v_ldexp_f32 v3, v3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 +; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 +; GCN-NEXT: v_rndne_f32_e32 v5, v3 +; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 +; GCN-NEXT: v_sub_f32_e32 v3, v3, v5 +; GCN-NEXT: v_add_f32_e32 v3, v3, v4 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] +; GCN-NEXT: v_ldexp_f32 v3, v3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 +; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 +; GCN-NEXT: v_rndne_f32_e32 v5, v3 +; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 +; GCN-NEXT: v_sub_f32_e32 v3, v3, v5 +; GCN-NEXT: v_add_f32_e32 v3, v3, v4 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] +; GCN-NEXT: v_ldexp_f32 v3, v3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 +; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 +; GCN-NEXT: v_rndne_f32_e32 v5, v3 +; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 +; GCN-NEXT: v_sub_f32_e32 v3, v3, v5 +; GCN-NEXT: v_add_f32_e32 v3, v3, v4 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] +; GCN-NEXT: v_ldexp_f32 v3, v3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 +; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 +; GCN-NEXT: v_rndne_f32_e32 v5, v3 +; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 +; GCN-NEXT: v_sub_f32_e32 v3, v3, v5 +; GCN-NEXT: v_add_f32_e32 v3, v3, v4 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] +; GCN-NEXT: v_ldexp_f32 v3, v3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1 +; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3 +; GCN-NEXT: v_rndne_f32_e32 v5, v3 +; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1 +; GCN-NEXT: v_sub_f32_e32 v3, v3, v5 +; GCN-NEXT: v_add_f32_e32 v3, v3, v4 +; GCN-NEXT: v_exp_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3] +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1 +; GCN-NEXT: v_ldexp_f32 v0, v3, v4 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GCN-NEXT: v_fma_f32 v3, v0, s0, -v1 +; GCN-NEXT: v_rndne_f32_e32 v4, v1 +; GCN-NEXT: v_fmac_f32_e32 v3, 0x32a5705f, v0 +; GCN-NEXT: v_sub_f32_e32 v1, v1, v4 +; GCN-NEXT: v_add_f32_e32 v1, v1, v3 +; GCN-NEXT: v_exp_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v3, v4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_ldexp_f32 v1, v1, v3 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: global_store_dword v4, v0, s[2:3] +; GCN-NEXT: s_endpgm + %mai0 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in1, i32 0, i32 0, i32 0) + %mai1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai0, i32 0, i32 0, i32 0) + %mai2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai1, i32 0, i32 0, i32 0) + %mai3 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai2, i32 0, i32 0, i32 0) + %mai4 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai3, i32 0, i32 0, i32 0) + %mai5 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai4, i32 0, i32 0, i32 0) + %mai6 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai5, i32 0, i32 0, i32 0) + %mai7 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai6, i32 0, i32 0, i32 0) + %exp0 = call float @llvm.exp.f32(float %in0) + %exp1 = call float @llvm.exp.f32(float %exp0) + %exp2 = call float @llvm.exp.f32(float %exp1) + %exp3 = call float @llvm.exp.f32(float %exp2) + %exp4 = call float @llvm.exp.f32(float %exp3) + %exp5 = call float @llvm.exp.f32(float %exp4) + %exp6 = call float @llvm.exp.f32(float %exp5) + %exp7 = call float @llvm.exp.f32(float %exp6) + store <4 x float> %mai7, ptr addrspace(1) %out0 + store float %exp7, ptr addrspace(1) %out1 + tail call void @llvm.amdgcn.iglp.opt(i32 3) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 6150cb5cd947..bc4d35f5a1f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -492,7 +492,6 @@ attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} !0 = !{i64 2862105} - ... --- @@ -899,4 +898,3 @@ body: | S_ENDPGM 0 ... - diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll index e3dd036ecc30..19da3f4503aa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll @@ -1,25 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 { -; GCN-LABEL: v_interp_f32: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_mov_b32 s3, exec_lo -; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: lds_param_load v0, attr0.y wait_vdst:15 -; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15 -; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 -; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0 -; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 -; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done -; GCN-NEXT: s_endpgm +; GFX11-LABEL: v_interp_f32: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: s_mov_b32 m0, s2 +; GFX11-NEXT: lds_param_load v0, attr0.y wait_vdst:15 +; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15 +; GFX11-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v4, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 +; GFX11-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0 +; GFX11-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 +; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v0, attr0.y wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v4, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0 +; GFX12-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 +; GFX12-NEXT: export mrt0 v3, v2, v5, v4 done +; GFX12-NEXT: s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0) %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) @@ -32,30 +53,55 @@ main_body: } define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 { -; GCN-LABEL: v_interp_f32_many: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_mov_b32 s3, exec_lo -; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: lds_param_load v0, attr0.x wait_vdst:15 -; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15 -; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15 -; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15 -; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 -; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 -; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 -; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0 -; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 -; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done -; GCN-NEXT: s_endpgm +; GFX11-LABEL: v_interp_f32_many: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: s_mov_b32 m0, s2 +; GFX11-NEXT: lds_param_load v0, attr0.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v2, attr2.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v3, attr3.x wait_vdst:15 +; GFX11-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 +; GFX11-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 +; GFX11-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 +; GFX11-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0 +; GFX11-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 +; GFX11-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 +; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32_many: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v0, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v2, attr2.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v3, attr3.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 +; GFX12-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 +; GFX12-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0 +; GFX12-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 +; GFX12-NEXT: export mrt0 v6, v7, v8, v4 done +; GFX12-NEXT: s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) @@ -74,30 +120,55 @@ main_body: } define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0) #0 { -; GCN-LABEL: v_interp_f32_many_vm: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_mov_b32 s0, exec_lo -; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo -; GCN-NEXT: lds_param_load v2, attr0.x wait_vdst:15 -; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15 -; GCN-NEXT: lds_param_load v4, attr2.x wait_vdst:15 -; GCN-NEXT: lds_param_load v5, attr3.x wait_vdst:15 -; GCN-NEXT: s_mov_b32 exec_lo, s0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3 -; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 -; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 -; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 -; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 -; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done -; GCN-NEXT: s_endpgm +; GFX11-LABEL: v_interp_f32_many_vm: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 +; GFX11-NEXT: s_mov_b32 m0, s0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: lds_param_load v2, attr0.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v3, attr1.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v4, attr2.x wait_vdst:15 +; GFX11-NEXT: lds_param_load v5, attr3.x wait_vdst:15 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3 +; GFX11-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 +; GFX11-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 +; GFX11-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 +; GFX11-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 +; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 +; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32_many_vm: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 +; GFX12-NEXT: s_mov_b32 m0, s0 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: ds_param_load v2, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v3, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v4, attr2.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v5, attr3.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3 +; GFX12-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 +; GFX12-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 +; GFX12-NEXT: export mrt0 v6, v7, v8, v0 done +; GFX12-NEXT: s_endpgm main_body: %i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1 %i = load float, ptr addrspace(1) %i.ptr, align 4 @@ -120,23 +191,59 @@ main_body: } define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { -; GCN-LABEL: v_interp_f16: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_mov_b32 s3, exec_lo -; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15 -; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0 -; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 -; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 -; GCN-NEXT: v_add_f16_e32 v0, v3, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_interp_f16: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo +; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2 +; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0 +; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7 +; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_interp_f16: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo +; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2 +; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: v_interp_f16: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX12-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX12-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX12-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX12-NEXT: ; return to shader part epilog main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0) @@ -148,23 +255,59 @@ main_body: } define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { -; GCN-LABEL: v_interp_rtz_f16: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_mov_b32 s3, exec_lo -; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15 -; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 -; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 -; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 -; GCN-NEXT: v_add_f16_e32 v0, v3, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_interp_rtz_f16: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo +; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2 +; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0 +; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7 +; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_interp_rtz_f16: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo +; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2 +; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: v_interp_rtz_f16: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX12-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX12-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX12-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX12-NEXT: ; return to shader part epilog main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0) @@ -176,17 +319,42 @@ main_body: } define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 { -; GCN-LABEL: v_interp_f16_imm_params: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7 -; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_add_f16_e32 v0, v1, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: v_interp_f16_imm_params: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7 +; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: v_interp_f16_imm_params: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7 +; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: v_interp_f16_imm_params: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX12-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX12-NEXT: ; return to shader part epilog main_body: %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0) %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 1b41a10eec3f..e592a4ac5e8f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -1,8 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s +; TODO: Run these for global isel as well. +; RUN: llc -mtriple=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1013 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1030 %s ; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX11 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr) @@ -18,11 +21,17 @@ declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 ; Arguments are flattened to represent the actual VGPR_A layout, so we have no ; extra moves in the generated kernel. define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { -; GCN-LABEL: image_bvh_intersect_ray: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; PRE-GFX12-LABEL: image_bvh_intersect_ray: +; PRE-GFX12: ; %bb.0: ; %main_body +; PRE-GFX12-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3] +; PRE-GFX12-NEXT: s_waitcnt vmcnt(0) +; PRE-GFX12-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: image_bvh_intersect_ray: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v[2:4], v[5:7], v[8:10]], s[0:3] +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog main_body: %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0 %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 @@ -79,6 +88,48 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, f ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: image_bvh_intersect_ray_a16: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: s_lshr_b32 s2, s7, 16 +; GFX12-SDAG-NEXT: s_lshr_b32 s3, s5, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GFX12-SDAG-NEXT: s_pack_ll_b32_b16 s3, s5, s7 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3 +; GFX12-SDAG-NEXT: s_pack_ll_b32_b16 s4, s6, s8 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4 +; GFX12-SDAG-NEXT: s_mov_b32 s15, s12 +; GFX12-SDAG-NEXT: s_mov_b32 s14, s11 +; GFX12-SDAG-NEXT: s_mov_b32 s13, s10 +; GFX12-SDAG-NEXT: s_mov_b32 s12, s9 +; GFX12-SDAG-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: image_bvh_intersect_ray_a16: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_mov_b32 s20, s2 +; GFX12-GISEL-NEXT: s_mov_b32 s22, s4 +; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX12-GISEL-NEXT: s_mov_b32 s21, s3 +; GFX12-GISEL-NEXT: s_pack_hh_b32_b16 s5, s7, s5 +; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX12-GISEL-NEXT: s_mov_b32 s16, s9 +; GFX12-GISEL-NEXT: s_mov_b32 s17, s10 +; GFX12-GISEL-NEXT: s_mov_b32 s18, s11 +; GFX12-GISEL-NEXT: s_mov_b32 s19, s12 +; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[16:19] a16 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> @@ -88,11 +139,17 @@ main_body: ; Arguments are flattened to represent the actual VGPR_A layout, so we have no ; extra moves in the generated kernel. define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { -; GCN-LABEL: image_bvh64_intersect_ray: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; PRE-GFX12-LABEL: image_bvh64_intersect_ray: +; PRE-GFX12: ; %bb.0: ; %main_body +; PRE-GFX12-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] +; PRE-GFX12-NEXT: s_waitcnt vmcnt(0) +; PRE-GFX12-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: image_bvh64_intersect_ray: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: image_bvh64_intersect_ray v[0:3], [v[0:1], v2, v[3:5], v[6:8], v[9:11]], s[0:3] +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog main_body: %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64 %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0 @@ -152,6 +209,50 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: image_bvh64_intersect_ray_a16: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1 +; GFX12-SDAG-NEXT: s_lshr_b32 s3, s6, 16 +; GFX12-SDAG-NEXT: s_pack_ll_b32_b16 s1, s6, s8 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-SDAG-NEXT: s_lshr_b32 s0, s8, 16 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX12-SDAG-NEXT: s_pack_ll_b32_b16 s3, s7, s9 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s15, s13 +; GFX12-SDAG-NEXT: s_mov_b32 s14, s12 +; GFX12-SDAG-NEXT: s_mov_b32 s13, s11 +; GFX12-SDAG-NEXT: s_mov_b32 s12, s10 +; GFX12-SDAG-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_a16: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_mov_b32 s20, s3 +; GFX12-GISEL-NEXT: s_mov_b32 s21, s4 +; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s4, s8, s6 +; GFX12-GISEL-NEXT: s_mov_b32 s22, s5 +; GFX12-GISEL-NEXT: s_pack_hh_b32_b16 s5, s8, s6 +; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s6, s9, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s21 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v5, s6 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; GFX12-GISEL-NEXT: s_mov_b32 s16, s10 +; GFX12-GISEL-NEXT: s_mov_b32 s17, s11 +; GFX12-GISEL-NEXT: s_mov_b32 s18, s12 +; GFX12-GISEL-NEXT: s_mov_b32 s19, s13 +; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[16:19] a16 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> @@ -239,6 +340,69 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: image_bvh_intersect_ray_nsa_reassign: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v8, 2.0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 4.0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX12-SDAG-NEXT: flat_load_b32 v9, v[0:1] +; GFX12-SDAG-NEXT: flat_load_b32 v10, v[2:3] +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x40e00000 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[4:7] +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: image_bvh_intersect_ray_nsa_reassign: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000 +; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 +; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000 +; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1] +; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3] +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-GISEL-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid @@ -329,6 +493,62 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX12-SDAG-NEXT: flat_load_b32 v6, v[0:1] +; GFX12-SDAG-NEXT: flat_load_b32 v7, v[2:3] +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x46004200 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v3, 0 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600 +; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700 +; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX12-GISEL-NEXT: flat_load_b32 v6, v[0:1] +; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3] +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-GISEL-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid @@ -429,6 +649,69 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: image_bvh64_intersect_ray_nsa_reassign: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 +; GFX12-SDAG-NEXT: v_bfrev_b32_e32 v10, 4.0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 +; GFX12-SDAG-NEXT: flat_load_b32 v11, v[0:1] +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x40e00000 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3] +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_nsa_reassign: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0 +; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0xb36211c7 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000 +; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0 +; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000 +; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12 +; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v10, 4.0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3] +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-GISEL-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid @@ -521,6 +804,64 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x48004500 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: v_bfrev_b32_e32 v7, 4.0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 +; GFX12-SDAG-NEXT: flat_load_b32 v8, v[0:1] +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x46004200 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0 +; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600 +; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700 +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v7, 4.0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-GISEL-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 2da602713d72..8d380516df8b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s ; FIXME: bfloat vector arguments are broken in globalisel. ; https://github.com/llvm/llvm-project/issues/77055 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 5d149f7c0c62..722c53a9dd60 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s -; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll index 049cc455ab01..53e37479f68e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll @@ -2,6 +2,8 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; GFX10PLUS-LABEL: {{^}}dpp8_test: ; GFX10PLUS: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll index eeef4eeb65a6..2faf375a97a8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll @@ -1,5 +1,5 @@ -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.prng.b32(i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll index 7bc3864ef5e1..46829b07f265 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK -; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_i32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll index 8c0138aeb0ba..4813a71f5c7b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK -; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_ptr_load_i32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.prefetch.data.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.prefetch.data.ll index fa6fc6047eac..dc33c4f0adf2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.prefetch.data.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.prefetch.data.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GCN %s declare void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) %rsrc, i32 %offset, i32 %len) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll index b677f7863c14..01df8efbb2e2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,SDAG %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,GISEL %s define amdgpu_ps void @prefetch_data_sgpr_base_sgpr_len(ptr addrspace(4) inreg %ptr, i32 inreg %len) { ; GCN-LABEL: prefetch_data_sgpr_base_sgpr_len: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index be6ef315e4c7..0a330e91f820 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s -; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index 259e3162e8bd..eb2d95e4db2d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %index) { ; CHECK-LABEL: struct_atomic_buffer_load_i32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index d4955d1b01f6..bc50b12b5904 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr, i32 %index) { ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll index daec7e9b91e7..0ca96d5a1eb1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll @@ -1,8 +1,8 @@ -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 < %s | FileCheck --check-prefixes=GCN,GFX942 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 < %s | FileCheck --check-prefixes=GCN,GFX942 %s ; DPP control value 337 is valid for 64-bit DPP on gfx942 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 6b57f20f25e2..84a3a3e88d23 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -163,9 +163,11 @@ define amdgpu_kernel void @ceil_v2f16( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ceil_f16_e32 v0.h, v1.l ; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 30cc060d05bb..ac515808a0d8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -15,17 +15,17 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s4 ; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; @@ -53,15 +53,15 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 +; VI-SDAG-NEXT: v_ldexp_f32 v2, v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -91,17 +91,17 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_f32: @@ -175,25 +175,26 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; SI-SDAG-NEXT: v_exp_f32_e32 v3, v1 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; SI-SDAG-NEXT: s_mov_b32 s4, s0 ; SI-SDAG-NEXT: s_mov_b32 s5, s1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 +; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v2, s0 +; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s0 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; @@ -225,22 +226,23 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 -; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; VI-SDAG-NEXT: v_exp_f32_e32 v2, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0 +; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_cselect_b32 s3, 0xffffffc0, 0 +; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, s3 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -273,23 +275,24 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v1 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, s3, v3 +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v3 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, v3, s4 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, s2 +; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v2f32: @@ -384,29 +387,31 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v1, s1, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_cselect_b32 s1, 0xffffffc0, 0 +; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-SDAG-NEXT: v_add_f32_e32 v3, s0, v3 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v4, s1, v4 +; SI-SDAG-NEXT: v_exp_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; SI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v6, s0, v6 -; SI-SDAG-NEXT: v_exp_f32_e32 v3, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v0 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v1, s1 +; SI-SDAG-NEXT: s_cselect_b32 s1, 0xffffffc0, 0 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v3, s1 +; SI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, v3, v7 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v6, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v2, v2, s0 ; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm @@ -446,31 +451,34 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-LABEL: s_exp2_v3f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v2, s2, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, s4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v4, s2, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 -; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v6, s1, v6 -; VI-SDAG-NEXT: v_exp_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_exp_f32_e32 v6, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 +; VI-SDAG-NEXT: v_add_f32_e32 v3, s1, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_exp_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: v_ldexp_f32 v1, v3, s4 +; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm ; @@ -510,29 +518,31 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v2, s2, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, s2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s2, v4 -; GFX900-SDAG-NEXT: v_add_f32_e32 v6, s1, v6 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s1, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, s0, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v6, v6 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[6:7] +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 +; GFX900-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, v4, s2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, s0 +; GFX900-SDAG-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v3f32: @@ -659,35 +669,38 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; SI-SDAG-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v2, s7, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v3, s6, v3 +; SI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v4, v3 +; SI-SDAG-NEXT: s_cselect_b32 s8, 0xffffffc0, 0 +; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s6, 0xffffffc0, 0 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e64 v3, v2, s8 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v2, v4, s6 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; SI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v6, s6, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v8, s5, v8 -; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v4, s5, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v0, s4, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 -; SI-SDAG-NEXT: v_exp_f32_e32 v8, v8 -; SI-SDAG-NEXT: v_exp_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: s_cselect_b32 s6, 0xffffffc0, 0 +; SI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v1, v4, s6 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s4 ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; @@ -733,34 +746,37 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: s_and_b64 s[6:7], vcc, exec ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, s2, v3 +; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v4, v3 +; VI-SDAG-NEXT: s_cselect_b32 s6, 0xffffffc0, 0 +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v3, v2, s6 +; VI-SDAG-NEXT: v_ldexp_f32 v2, v4, s2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v6, s2, v6 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v4, s1, v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 -; VI-SDAG-NEXT: v_exp_f32_e32 v6, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v8, s1, v8 -; VI-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 -; VI-SDAG-NEXT: v_exp_f32_e32 v8, v8 -; VI-SDAG-NEXT: v_exp_f32_e32 v9, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: v_ldexp_f32 v1, v4, s2 +; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm @@ -807,34 +823,37 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX900-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v3, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v2, s3, v2 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, s2, v3 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v5, v3 +; GFX900-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v3, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v3, v2, s4 +; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v5, s2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v5, s3, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v7, s2, v7 -; GFX900-SDAG-NEXT: v_add_f32_e32 v9, s1, v9 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v5, s1, v5 +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, s0, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v5, v5 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v7, v7 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v9, v9 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v10, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, v5, v2 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v7, v6 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v9, v8 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v10, v0 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 +; GFX900-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, v5, s2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, s0 ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; @@ -973,19 +992,19 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) } define float @v_exp2_f32(float %in) { -; GCN-SDAG-LABEL: v_exp2_f32: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32: ; GCN-GISEL: ; %bb.0: @@ -1001,6 +1020,34 @@ define float @v_exp2_f32(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1015,19 +1062,19 @@ define float @v_exp2_f32(float %in) { } define float @v_exp2_fabs_f32(float %in) { -; GCN-SDAG-LABEL: v_exp2_fabs_f32: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e64 v0, |v0|, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_fabs_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e64 v0, |v0|, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_fabs_f32: ; GCN-GISEL: ; %bb.0: @@ -1043,6 +1090,34 @@ define float @v_exp2_fabs_f32(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_fabs_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e64 v0, |v0|, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_fabs_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e64 v0, |v0|, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_fabs_f32: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1058,19 +1133,19 @@ define float @v_exp2_fabs_f32(float %in) { } define float @v_exp2_fneg_fabs_f32(float %in) { -; GCN-SDAG-LABEL: v_exp2_fneg_fabs_f32: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0x42fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_sub_f32_e64 v0, v2, |v0| -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_fneg_fabs_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x42fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e64 v0, v2, |v0| +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_fneg_fabs_f32: ; GCN-GISEL: ; %bb.0: @@ -1086,6 +1161,34 @@ define float @v_exp2_fneg_fabs_f32(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_fneg_fabs_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x42fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e64 v0, v2, |v0| +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x42fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e64 v0, v2, |v0| +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_fneg_fabs_f32: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1102,19 +1205,19 @@ define float @v_exp2_fneg_fabs_f32(float %in) { } define float @v_exp2_fneg_f32(float %in) { -; GCN-SDAG-LABEL: v_exp2_fneg_f32: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0x42fc0000 -; GCN-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_fneg_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x42fc0000 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_fneg_f32: ; GCN-GISEL: ; %bb.0: @@ -1130,6 +1233,34 @@ define float @v_exp2_fneg_f32(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_fneg_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x42fc0000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_fneg_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x42fc0000 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_fneg_f32: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1145,19 +1276,19 @@ define float @v_exp2_fneg_f32(float %in) { } define float @v_exp2_f32_fast(float %in) { -; GCN-SDAG-LABEL: v_exp2_f32_fast: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_fast: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_fast: ; GCN-GISEL: ; %bb.0: @@ -1173,6 +1304,34 @@ define float @v_exp2_f32_fast(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_fast: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_fast: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_fast: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1187,19 +1346,19 @@ define float @v_exp2_f32_fast(float %in) { } define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { -; GCN-SDAG-LABEL: v_exp2_f32_unsafe_math_attr: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_unsafe_math_attr: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_unsafe_math_attr: ; GCN-GISEL: ; %bb.0: @@ -1215,6 +1374,34 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_unsafe_math_attr: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_unsafe_math_attr: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_unsafe_math_attr: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1229,19 +1416,19 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { } define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { -; GCN-SDAG-LABEL: v_exp2_f32_approx_fn_attr: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_approx_fn_attr: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_approx_fn_attr: ; GCN-GISEL: ; %bb.0: @@ -1257,6 +1444,34 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_approx_fn_attr: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_approx_fn_attr: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_approx_fn_attr: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1271,19 +1486,19 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" } define float @v_exp2_f32_ninf(float %in) { -; GCN-SDAG-LABEL: v_exp2_f32_ninf: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_ninf: ; GCN-GISEL: ; %bb.0: @@ -1299,6 +1514,34 @@ define float @v_exp2_f32_ninf(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_ninf: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_ninf: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_ninf: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1313,19 +1556,19 @@ define float @v_exp2_f32_ninf(float %in) { } define float @v_exp2_f32_afn(float %in) { -; GCN-SDAG-LABEL: v_exp2_f32_afn: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_afn: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_afn: ; GCN-GISEL: ; %bb.0: @@ -1341,6 +1584,34 @@ define float @v_exp2_f32_afn(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_afn: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_afn: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1375,19 +1646,19 @@ define float @v_exp2_f32_afn_daz(float %in) #0 { } define float @v_exp2_f32_afn_dynamic(float %in) #1 { -; GCN-SDAG-LABEL: v_exp2_f32_afn_dynamic: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_afn_dynamic: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_afn_dynamic: ; GCN-GISEL: ; %bb.0: @@ -1403,6 +1674,34 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_afn_dynamic: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_afn_dynamic: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_afn_dynamic: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1417,19 +1716,19 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 { } define float @v_fabs_exp2_f32_afn(float %in) { -; GCN-SDAG-LABEL: v_fabs_exp2_f32_afn: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e64 v0, |v0|, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_fabs_exp2_f32_afn: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e64 v0, |v0|, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_fabs_exp2_f32_afn: ; GCN-GISEL: ; %bb.0: @@ -1445,6 +1744,34 @@ define float @v_fabs_exp2_f32_afn(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_fabs_exp2_f32_afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e64 v0, |v0|, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_fabs_exp2_f32_afn: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e64 v0, |v0|, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_fabs_exp2_f32_afn: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1480,19 +1807,19 @@ define float @v_exp2_f32_daz(float %in) #0 { } define float @v_exp2_f32_nnan(float %in) { -; GCN-SDAG-LABEL: v_exp2_f32_nnan: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_nnan: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_nnan: ; GCN-GISEL: ; %bb.0: @@ -1508,6 +1835,34 @@ define float @v_exp2_f32_nnan(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_nnan: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_nnan: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_nnan: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1542,19 +1897,19 @@ define float @v_exp2_f32_nnan_daz(float %in) #0 { } define float @v_exp2_f32_nnan_dynamic(float %in) #1 { -; GCN-SDAG-LABEL: v_exp2_f32_nnan_dynamic: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_nnan_dynamic: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_nnan_dynamic: ; GCN-GISEL: ; %bb.0: @@ -1570,6 +1925,34 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_nnan_dynamic: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_nnan_dynamic: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_nnan_dynamic: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1604,19 +1987,19 @@ define float @v_exp2_f32_ninf_daz(float %in) #0 { } define float @v_exp2_f32_ninf_dynamic(float %in) #1 { -; GCN-SDAG-LABEL: v_exp2_f32_ninf_dynamic: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_ninf_dynamic: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_ninf_dynamic: ; GCN-GISEL: ; %bb.0: @@ -1632,6 +2015,34 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_ninf_dynamic: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_ninf_dynamic: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_ninf_dynamic: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1646,19 +2057,19 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 { } define float @v_exp2_f32_nnan_ninf(float %in) { -; GCN-SDAG-LABEL: v_exp2_f32_nnan_ninf: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_nnan_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf: ; GCN-GISEL: ; %bb.0: @@ -1674,6 +2085,34 @@ define float @v_exp2_f32_nnan_ninf(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_nnan_ninf: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_nnan_ninf: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_nnan_ninf: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1708,19 +2147,19 @@ define float @v_exp2_f32_nnan_ninf_daz(float %in) #0 { } define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 { -; GCN-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic: ; GCN-GISEL: ; %bb.0: @@ -1736,6 +2175,34 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_nnan_ninf_dynamic: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1770,19 +2237,19 @@ define float @v_exp2_f32_fast_daz(float %in) #0 { } define float @v_exp2_f32_dynamic_mode(float %in) #1 { -; GCN-SDAG-LABEL: v_exp2_f32_dynamic_mode: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_dynamic_mode: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_dynamic_mode: ; GCN-GISEL: ; %bb.0: @@ -1798,6 +2265,34 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_dynamic_mode: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_dynamic_mode: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_dynamic_mode: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1912,9 +2407,9 @@ define float @v_exp2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_f32_from_fpext_math_f16: @@ -1978,9 +2473,9 @@ define float @v_exp2_f32_from_fpext_bf16(bfloat %src) { ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-NEXT: v_add_f32_e32 v0, v0, v2 ; SI-NEXT: v_exp_f32_e32 v0, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; SI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_not_b32_e32 v1, 63 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_exp2_f32_from_fpext_bf16: @@ -1993,9 +2488,9 @@ define float @v_exp2_f32_from_fpext_bf16(bfloat %src) { ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-NEXT: v_exp_f32_e32 v0, v0 -; VI-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; VI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_not_b32_e32 v1, 63 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_exp2_f32_from_fpext_bf16: @@ -2008,9 +2503,9 @@ define float @v_exp2_f32_from_fpext_bf16(bfloat %src) { ; GFX900-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX900-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX900-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_not_b32_e32 v1, 63 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_from_fpext_bf16: @@ -2850,19 +3345,19 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) { } define float @v_exp2_f32_contract(float %in) { -; GCN-SDAG-LABEL: v_exp2_f32_contract: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_contract: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_contract: ; GCN-GISEL: ; %bb.0: @@ -2878,6 +3373,34 @@ define float @v_exp2_f32_contract(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_contract: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_contract: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_contract: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -2912,19 +3435,19 @@ define float @v_exp2_f32_contract_daz(float %in) #0 { } define float @v_exp2_f32_contract_nnan_ninf(float %in) { -; GCN-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GCN-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf: ; GCN-GISEL: ; %bb.0: @@ -2940,6 +3463,34 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) { ; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_not_b32_e32 v1, 63 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_contract_nnan_ninf: ; R600: ; %bb.0: ; R600-NEXT: CF_END diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll index 4f206b82fdd6..9909cfd32b11 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -164,9 +164,11 @@ define amdgpu_kernel void @floor_v2f16( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_floor_f16_e32 v0.h, v1.l ; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 72e86f1f6f99..53c26cadbf75 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -480,11 +480,9 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) { ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32: @@ -610,12 +608,11 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16: @@ -744,12 +741,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v4, s0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v3, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32: @@ -900,7 +896,7 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0 +; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16: @@ -1043,24 +1039,21 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32: @@ -1257,8 +1250,8 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 279ffeab51fb..218e41faa703 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -17,19 +17,19 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3377d1cf -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s1, v2 ; SI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 @@ -73,11 +73,11 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c ; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s0, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -134,27 +134,27 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s1, v3 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s1, v3 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX900-SDAG-NEXT: global_store_dword v0, v1, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_f32: @@ -190,21 +190,23 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -314,38 +316,39 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf ; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s3, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s3, 0x3f317217 ; SI-SDAG-NEXT: s_mov_b32 s4, s0 ; SI-SDAG-NEXT: s_mov_b32 s5, s1 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v2 -; SI-SDAG-NEXT: v_fma_f32 v4, v2, s3, -v3 -; SI-SDAG-NEXT: v_fma_f32 v4, v2, s8, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s9 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s3, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s8, v3 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s9 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s2, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v2, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 -; SI-SDAG-NEXT: v_fma_f32 v4, v0, s3, -v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0 +; SI-SDAG-NEXT: v_fma_f32 v4, v0, s3, -v3 ; SI-SDAG-NEXT: v_fma_f32 v4, v0, s8, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s9 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -394,43 +397,44 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s2 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s7, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s6, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v2, v1 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 +; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v4 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 @@ -488,36 +492,37 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217 ; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s11, v3 -; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s2, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s3, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v3|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s10, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s10, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v3, v1 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v3 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v4 ; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s3, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 ; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-SDAG-NEXT: s_endpgm @@ -564,31 +569,37 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, 0x3f317217, v0 :: v_dual_mul_f32 v3, 0x3f317217, v1 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v3, 0x3f317217, v1 :: v_dual_lshlrev_b32 v0, 5, v0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_fma_f32 v5, 0x3f317217, v1, -v3 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v5, 0x3377d1cf, v1 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s5 ; GFX1100-SDAG-NEXT: v_fma_f32 v4, 0x3f317217, v0, -v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v5, 0x3f317217, v1, -v3 -; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v4, 0x3377d1cf, v0 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| ; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5 ; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm @@ -742,49 +753,51 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd ; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s11, 0x3377d1cf +; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s9, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s9, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s9, 0x3f317217 -; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1 -; SI-SDAG-NEXT: v_fma_f32 v4, v1, s9, -v3 -; SI-SDAG-NEXT: v_fma_f32 v4, v1, s11, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s9, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, v3 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1] -; SI-SDAG-NEXT: v_mul_f32_e32 v3, s8, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s8, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 -; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s10, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v3, s9, -v5 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v3, s11, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v2 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v3 +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v4 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, v5 -; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s10, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-SDAG-NEXT: v_log_f32_e32 v5, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s12 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5 +; SI-SDAG-NEXT: v_fma_f32 v4, v5, s9, -v2 +; SI-SDAG-NEXT: v_fma_f32 v4, v5, s11, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v5|, s12 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -847,47 +860,49 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 ; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s10, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s10, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s9, v3 -; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v2, s9, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v2 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 -; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, v3, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v1 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[2:3] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] ; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 @@ -978,39 +993,41 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s10, v2 -; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s10, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s4, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s5, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s10 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s4, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s5, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s10 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s9, v4 -; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s9, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317217, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s4, -v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v4 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s4, -v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s5, v7 -; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s5, v6 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s10 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v6, s[2:3] +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0 @@ -1079,48 +1096,54 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s6 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s1, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v2 :: v_dual_lshlrev_b32 v0, 5, v0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s2, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3f317217, v2, -v5 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v8, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v5, v5, v8 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3f317217, v0, -v3 +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3377d1cf, v0 :: v_dual_lshlrev_b32 v1, 5, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s1, v1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v2 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v1 ; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3f317217, v1, -v4 -; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v6, 0x3377d1cf, v0 -; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3f317217, v2, -v5 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 -; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v8, 0x3377d1cf, v2 :: v_dual_cndmask_b32 v1, v1, v4 -; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_mov_b32 v4, 0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v1, v1, v10 +; GFX1100-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v10 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc_lo ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1343,60 +1366,63 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd ; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3377d1cf +; SI-SDAG-NEXT: s_mov_b32 s13, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s11, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s11, 0x3f317217 -; SI-SDAG-NEXT: s_mov_b32 s13, 0x7f800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v2 -; SI-SDAG-NEXT: v_fma_f32 v4, v2, s11, -v3 -; SI-SDAG-NEXT: v_fma_f32 v4, v2, s12, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s12, v3 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s13 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; SI-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v4, v3 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s10, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v4 -; SI-SDAG-NEXT: v_fma_f32 v6, v4, s11, -v2 -; SI-SDAG-NEXT: v_fma_f32 v6, v4, s12, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v6 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s9, v6 -; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v2 +; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, -v1 +; SI-SDAG-NEXT: v_fma_f32 v5, v2, s12, v5 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s9, v5 +; SI-SDAG-NEXT: v_log_f32_e32 v5, v5 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s13 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1] ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 -; SI-SDAG-NEXT: v_fma_f32 v7, v6, s11, -v4 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s8, v0 +; SI-SDAG-NEXT: v_fma_f32 v6, v5, s11, -v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_fma_f32 v7, v6, s12, v7 -; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v0, s11, -v4 +; SI-SDAG-NEXT: v_fma_f32 v6, v5, s12, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s13 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 +; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0 +; SI-SDAG-NEXT: v_fma_f32 v6, v0, s11, -v5 ; SI-SDAG-NEXT: v_fma_f32 v6, v0, s12, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -1470,78 +1496,81 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 ; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3 -; VI-SDAG-NEXT: v_log_f32_e32 v4, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v4 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, v4, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v2, s10, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41b17218 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 +; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v2 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v2, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v5, s9, v5 +; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1] +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-SDAG-NEXT: v_sub_f32_e32 v6, v5, v1 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v6 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v6, s9, v6 -; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v4 -; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v7, v6, v4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3f317000, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 -; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, v0, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 +; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v6, v0, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v5 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 @@ -1635,60 +1664,63 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2 -; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: s_mov_b32 s11, 0x7f800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x41b17218 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s4, -v3 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s5, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x41b17218 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s5, v3 +; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s11 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3 -; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v3 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s10, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v5 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s4, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s5, v7 -; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s9, v7 -; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1] +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v2 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, s4, -v1 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, s5, v6 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6 +; GFX900-SDAG-NEXT: v_ldexp_f32 v6, s9, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s11 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[0:1] ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v7 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s4, -v5 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v6 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s4, -v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s5, v8 -; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v8 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v7|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[2:3] -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s4, -v5 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s5, v7 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s11 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[2:3] +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317217, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s4, -v6 ; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s5, v7 -; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1] -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm @@ -1760,56 +1792,65 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s1, v2 :: v_dual_mul_f32 v3, s0, v3 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s1, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s0, v3 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317217, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v8, 0x3f317217, v3 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3f317217, v2, -v7 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3f317217, v3, -v8 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v7, v7, v12 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_add_f32 v8, v8, v13 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317217, v1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3f317217, v0, -v5 ; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3f317217, v1, -v6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3f317217, v2, -v7 -; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3f317217, v3, -v8 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v10, 0x3377d1cf, v0 :: v_dual_fmac_f32 v11, 0x3377d1cf, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3 ; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_sub_f32 v2, v1, v9 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc_lo ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1] @@ -2062,10 +2103,10 @@ define float @v_log_f32(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -2108,10 +2149,10 @@ define float @v_log_f32(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -2160,10 +2201,10 @@ define float @v_log_f32(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -2206,21 +2247,22 @@ define float @v_log_f32(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2264,10 +2306,10 @@ define float @v_log_fabs_f32(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -2310,10 +2352,10 @@ define float @v_log_fabs_f32(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -2362,10 +2404,10 @@ define float @v_log_fabs_f32(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -2409,20 +2451,22 @@ define float @v_log_fabs_f32(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2467,10 +2511,10 @@ define float @v_log_fneg_fabs_f32(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -2513,10 +2557,10 @@ define float @v_log_fneg_fabs_f32(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -2565,10 +2609,10 @@ define float @v_log_fneg_fabs_f32(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -2612,20 +2656,22 @@ define float @v_log_fneg_fabs_f32(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2671,10 +2717,10 @@ define float @v_log_fneg_f32(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -2717,10 +2763,10 @@ define float @v_log_fneg_f32(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -2769,10 +2815,10 @@ define float @v_log_fneg_f32(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -2815,21 +2861,22 @@ define float @v_log_fneg_f32(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2875,9 +2922,9 @@ define float @v_log_f32_fast(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -2902,9 +2949,9 @@ define float @v_log_f32_fast(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -2929,9 +2976,9 @@ define float @v_log_f32_fast(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -2955,10 +3002,12 @@ define float @v_log_f32_fast(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -2993,9 +3042,9 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3020,9 +3069,9 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3047,9 +3096,9 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3073,10 +3122,12 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -3111,9 +3162,9 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3138,9 +3189,9 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3165,9 +3216,9 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3191,10 +3242,12 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -3228,10 +3281,10 @@ define float @v_log_f32_ninf(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -3274,10 +3327,10 @@ define float @v_log_f32_ninf(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -3326,10 +3379,10 @@ define float @v_log_f32_ninf(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -3372,21 +3425,22 @@ define float @v_log_f32_ninf(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3431,9 +3485,9 @@ define float @v_log_f32_afn(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3458,9 +3512,9 @@ define float @v_log_f32_afn(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3485,9 +3539,9 @@ define float @v_log_f32_afn(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3511,10 +3565,12 @@ define float @v_log_f32_afn(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -3578,9 +3634,9 @@ define float @v_log_f32_afn_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3605,9 +3661,9 @@ define float @v_log_f32_afn_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3632,9 +3688,9 @@ define float @v_log_f32_afn_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3658,10 +3714,12 @@ define float @v_log_f32_afn_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -3696,9 +3754,9 @@ define float @v_fabs_log_f32_afn(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3723,9 +3781,9 @@ define float @v_fabs_log_f32_afn(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3750,9 +3808,9 @@ define float @v_fabs_log_f32_afn(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3777,10 +3835,11 @@ define float @v_fabs_log_f32_afn(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, s0 -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 @@ -3956,10 +4015,10 @@ define float @v_log_f32_nnan(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -4002,10 +4061,10 @@ define float @v_log_f32_nnan(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -4054,10 +4113,10 @@ define float @v_log_f32_nnan(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -4100,21 +4159,22 @@ define float @v_log_f32_nnan(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4298,10 +4358,10 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -4344,10 +4404,10 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -4396,10 +4456,10 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -4442,21 +4502,22 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4640,10 +4701,10 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -4686,10 +4747,10 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -4738,10 +4799,10 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -4784,21 +4845,22 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4842,10 +4904,10 @@ define float @v_log_f32_nnan_ninf(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -4882,10 +4944,10 @@ define float @v_log_f32_nnan_ninf(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -4928,10 +4990,10 @@ define float @v_log_f32_nnan_ninf(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -4968,18 +5030,20 @@ define float @v_log_f32_nnan_ninf(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3377d1cf, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5123,10 +5187,10 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -5163,10 +5227,10 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -5209,10 +5273,10 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -5249,18 +5313,20 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3377d1cf, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5330,10 +5396,10 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -5376,10 +5442,10 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -5428,10 +5494,10 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -5474,21 +5540,22 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6007,17 +6074,17 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3377d1cf -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 @@ -6179,10 +6246,10 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, 0x800000 -; SI-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-NEXT: v_log_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s4, 0x3f317217 ; SI-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -6203,10 +6270,10 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_mov_b32 s4, 0x800000 -; VI-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-NEXT: v_log_f32_e32 v0, v0 ; VI-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -6230,10 +6297,10 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-NEXT: v_log_f32_e32 v0, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x3f317217 ; GFX900-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 @@ -6255,20 +6322,22 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index df880164b196..fd50d1b60fbd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -17,19 +17,19 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3284fbcf -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s1, v2 ; SI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 @@ -73,11 +73,11 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c ; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s0, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -134,27 +134,27 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s1, v3 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s1, v3 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX900-SDAG-NEXT: global_store_dword v0, v1, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_f32: @@ -190,21 +190,23 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -314,38 +316,39 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf ; SI-SDAG-NEXT: s_mov_b32 s9, 0x7f800000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v2, s3, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s3, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s3, 0x3e9a209a ; SI-SDAG-NEXT: s_mov_b32 s4, s0 ; SI-SDAG-NEXT: s_mov_b32 s5, s1 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v2 -; SI-SDAG-NEXT: v_fma_f32 v4, v2, s3, -v3 -; SI-SDAG-NEXT: v_fma_f32 v4, v2, s8, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s9 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s3, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s8, v3 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s9 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s2, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v2, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 -; SI-SDAG-NEXT: v_fma_f32 v4, v0, s3, -v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0 +; SI-SDAG-NEXT: v_fma_f32 v4, v0, s3, -v3 ; SI-SDAG-NEXT: v_fma_f32 v4, v0, s8, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s9 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -394,43 +397,44 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s2 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s7, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s6, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s6, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v2, v1 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 +; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v4, v0, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v4 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 @@ -488,36 +492,37 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a ; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3284fbcf +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s11, v3 -; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s2, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v3, s3, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v3|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s10, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s10, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v3, v1 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v3 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s2, -v4 ; GFX900-SDAG-NEXT: v_fma_f32 v5, v0, s3, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 ; GFX900-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-SDAG-NEXT: s_endpgm @@ -564,31 +569,37 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, 0x3e9a209a, v0 :: v_dual_mul_f32 v3, 0x3e9a209a, v1 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v3, 0x3e9a209a, v1 :: v_dual_lshlrev_b32 v0, 5, v0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_fma_f32 v5, 0x3e9a209a, v1, -v3 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v5, 0x3284fbcf, v1 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s5 ; GFX1100-SDAG-NEXT: v_fma_f32 v4, 0x3e9a209a, v0, -v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v5, 0x3e9a209a, v1, -v3 -; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v4, 0x3284fbcf, v0 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| ; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5 ; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX1100-SDAG-NEXT: s_endpgm @@ -742,49 +753,51 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd ; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s11, 0x3284fbcf +; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s9, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s9, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s9, 0x3e9a209a -; SI-SDAG-NEXT: s_mov_b32 s12, 0x7f800000 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v1 -; SI-SDAG-NEXT: v_fma_f32 v4, v1, s9, -v3 -; SI-SDAG-NEXT: v_fma_f32 v4, v1, s11, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s9, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, v3 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1] -; SI-SDAG-NEXT: v_mul_f32_e32 v3, s8, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b -; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s8, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 -; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s10, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v3, s9, -v5 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v3, s11, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v2 -; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v3 +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f32 v5, v2, s9, -v4 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, v5 -; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s12 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s10, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-SDAG-NEXT: v_log_f32_e32 v5, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s12 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5 +; SI-SDAG-NEXT: v_fma_f32 v4, v5, s9, -v2 +; SI-SDAG-NEXT: v_fma_f32 v4, v5, s11, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v5|, s12 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -847,47 +860,49 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 ; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s10, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s10, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s9, v3 -; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v2, s9, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v2 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b -; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 -; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, v3, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v5 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v3 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v1 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[2:3] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] ; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 @@ -978,39 +993,41 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s10, v2 -; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s10, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x7f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s4, -v4 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s5, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s10 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s4, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s5, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s10 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s9, v4 -; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s9, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s8, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s4, -v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v4 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s4, -v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v4, s5, v7 -; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v4, s5, v6 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s10 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v6, s[2:3] +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] ; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0 @@ -1079,48 +1096,54 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s6 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s1, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v2 :: v_dual_lshlrev_b32 v0, 5, v0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s2, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3e9a209a, v2, -v5 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v8, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v5, v5, v8 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v6, 0x3e9a209a, v0, -v3 +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v6, 0x3284fbcf, v0 :: v_dual_lshlrev_b32 v1, 5, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s1, v1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v2 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v1 ; GFX1100-SDAG-NEXT: v_fma_f32 v7, 0x3e9a209a, v1, -v4 -; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v6, 0x3284fbcf, v0 -; GFX1100-SDAG-NEXT: v_fma_f32 v8, 0x3e9a209a, v2, -v5 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 -; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v8, 0x3284fbcf, v2 :: v_dual_cndmask_b32 v1, v1, v4 -; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_mov_b32 v4, 0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v1, v1, v10 +; GFX1100-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v10 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc_lo ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1343,60 +1366,63 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd ; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3284fbcf +; SI-SDAG-NEXT: s_mov_b32 s13, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s11, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s11, 0x3e9a209a -; SI-SDAG-NEXT: s_mov_b32 s13, 0x7f800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v2 -; SI-SDAG-NEXT: v_fma_f32 v4, v2, s11, -v3 -; SI-SDAG-NEXT: v_fma_f32 v4, v2, s12, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s11, -v2 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s12, v3 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s13 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; SI-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3 -; SI-SDAG-NEXT: v_log_f32_e32 v4, v3 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, s10, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v4 -; SI-SDAG-NEXT: v_fma_f32 v6, v4, s11, -v2 -; SI-SDAG-NEXT: v_fma_f32 v6, v4, s12, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v6 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s9, v6 -; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v2 +; SI-SDAG-NEXT: v_fma_f32 v5, v2, s11, -v1 +; SI-SDAG-NEXT: v_fma_f32 v5, v2, s12, v5 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s9, v5 +; SI-SDAG-NEXT: v_log_f32_e32 v5, v5 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s13 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1] ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; SI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 -; SI-SDAG-NEXT: v_fma_f32 v7, v6, s11, -v4 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s8, v0 +; SI-SDAG-NEXT: v_fma_f32 v6, v5, s11, -v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_fma_f32 v7, v6, s12, v7 -; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[2:3] -; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0 -; SI-SDAG-NEXT: v_fma_f32 v6, v0, s11, -v4 +; SI-SDAG-NEXT: v_fma_f32 v6, v5, s12, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s13 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3] +; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 +; SI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0 +; SI-SDAG-NEXT: v_fma_f32 v6, v0, s11, -v5 ; SI-SDAG-NEXT: v_fma_f32 v6, v0, s12, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s13 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -1470,78 +1496,81 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 ; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v5, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v6, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3 -; VI-SDAG-NEXT: v_log_f32_e32 v4, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v4 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, v4, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v2, s10, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x411a209b +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 +; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v2 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v2, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v5, s9, v5 +; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1] +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-SDAG-NEXT: v_sub_f32_e32 v6, v5, v1 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v6 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v6, s9, v6 -; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v4|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v4 -; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v6 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v7, v6, v4 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3e9a2000, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[2:3] -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v4 -; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, v0, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s6 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 +; VI-SDAG-NEXT: v_and_b32_e32 v5, 0xfffff000, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v6, v0, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v5 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v8, v6 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v7, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v5, v6 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 @@ -1635,60 +1664,63 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s11, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s11, v2 -; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s11, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: s_mov_b32 s11, 0x7f800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x411a209b -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s4, -v3 -; GFX900-SDAG-NEXT: v_fma_f32 v5, v2, s5, v5 -; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x411a209b +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v3, v1, s5, v3 +; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, s11 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s10, v3 -; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v3 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v2, v3 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v2, s10, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v1, v3 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s9, v0 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v5 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s4, -v2 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v5, s5, v7 -; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s9, v7 -; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1] +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v2 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, s4, -v1 +; GFX900-SDAG-NEXT: v_fma_f32 v6, v2, s5, v6 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6 +; GFX900-SDAG-NEXT: v_ldexp_f32 v6, s9, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s11 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[0:1] ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s8, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v7 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s8, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s4, -v5 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v1, v2 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v6 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, s8, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s4, -v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v8, v7, s5, v8 -; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v8 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v7|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[2:3] -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v5 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s4, -v5 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v6, s5, v7 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, s11 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[2:3] +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v1, v6 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s4, -v6 ; GFX900-SDAG-NEXT: v_fma_f32 v7, v0, s5, v7 -; GFX900-SDAG-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX900-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s11 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1] -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[0:1] ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm @@ -1760,56 +1792,65 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s1, v2 :: v_dual_mul_f32 v3, s0, v3 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s1, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s0, v3 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v3 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3e9a209a, v2, -v7 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3e9a209a, v3, -v8 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v7, v7, v12 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_add_f32 v8, v8, v13 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fma_f32 v10, 0x3e9a209a, v0, -v5 ; GFX1100-SDAG-NEXT: v_fma_f32 v11, 0x3e9a209a, v1, -v6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_fma_f32 v12, 0x3e9a209a, v2, -v7 -; GFX1100-SDAG-NEXT: v_fma_f32 v13, 0x3e9a209a, v3, -v8 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v10, 0x3284fbcf, v0 :: v_dual_fmac_f32 v11, 0x3284fbcf, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3 ; GFX1100-SDAG-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_sub_f32 v2, v1, v9 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc_lo ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1] @@ -2062,10 +2103,10 @@ define float @v_log10_f32(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -2108,10 +2149,10 @@ define float @v_log10_f32(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -2160,10 +2201,10 @@ define float @v_log10_f32(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -2206,21 +2247,22 @@ define float @v_log10_f32(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2264,10 +2306,10 @@ define float @v_log10_fabs_f32(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -2310,10 +2352,10 @@ define float @v_log10_fabs_f32(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -2362,10 +2404,10 @@ define float @v_log10_fabs_f32(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -2409,20 +2451,22 @@ define float @v_log10_fabs_f32(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v1 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2467,10 +2511,10 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -2513,10 +2557,10 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -2565,10 +2609,10 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -2612,20 +2656,22 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2671,10 +2717,10 @@ define float @v_log10_fneg_f32(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -2717,10 +2763,10 @@ define float @v_log10_fneg_f32(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -2769,10 +2815,10 @@ define float @v_log10_fneg_f32(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -2815,21 +2861,22 @@ define float @v_log10_fneg_f32(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2875,9 +2922,9 @@ define float @v_log10_f32_fast(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -2902,9 +2949,9 @@ define float @v_log10_f32_fast(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -2929,9 +2976,9 @@ define float @v_log10_f32_fast(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -2955,10 +3002,12 @@ define float @v_log10_f32_fast(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -2993,9 +3042,9 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3020,9 +3069,9 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3047,9 +3096,9 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3073,10 +3122,12 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -3111,9 +3162,9 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3138,9 +3189,9 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3165,9 +3216,9 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3191,10 +3242,12 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -3228,10 +3281,10 @@ define float @v_log10_f32_ninf(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -3274,10 +3327,10 @@ define float @v_log10_f32_ninf(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -3326,10 +3379,10 @@ define float @v_log10_f32_ninf(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -3372,21 +3425,22 @@ define float @v_log10_f32_ninf(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3431,9 +3485,9 @@ define float @v_log10_f32_afn(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3458,9 +3512,9 @@ define float @v_log10_f32_afn(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3485,9 +3539,9 @@ define float @v_log10_f32_afn(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3511,10 +3565,12 @@ define float @v_log10_f32_afn(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -3578,9 +3634,9 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3605,9 +3661,9 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3632,9 +3688,9 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3658,10 +3714,12 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -3696,9 +3754,9 @@ define float @v_fabs_log10_f32_afn(float %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3723,9 +3781,9 @@ define float @v_fabs_log10_f32_afn(float %in) { ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3750,9 +3808,9 @@ define float @v_fabs_log10_f32_afn(float %in) { ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3777,10 +3835,11 @@ define float @v_fabs_log10_f32_afn(float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, s0 -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 @@ -3956,10 +4015,10 @@ define float @v_log10_f32_nnan(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -4002,10 +4061,10 @@ define float @v_log10_f32_nnan(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -4054,10 +4113,10 @@ define float @v_log10_f32_nnan(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -4100,21 +4159,22 @@ define float @v_log10_f32_nnan(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4298,10 +4358,10 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -4344,10 +4404,10 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -4396,10 +4456,10 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -4442,21 +4502,22 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4640,10 +4701,10 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -4686,10 +4747,10 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -4738,10 +4799,10 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -4784,21 +4845,22 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -4842,10 +4904,10 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -4882,10 +4944,10 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -4928,10 +4990,10 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -4968,18 +5030,20 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3284fbcf, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5123,10 +5187,10 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -5163,10 +5227,10 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -5209,10 +5273,10 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -5249,18 +5313,20 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3284fbcf, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5330,10 +5396,10 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -5376,10 +5442,10 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -5428,10 +5494,10 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -5474,21 +5540,22 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -6007,17 +6074,17 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3284fbcf -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 @@ -6179,10 +6246,10 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, 0x800000 -; SI-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-NEXT: v_log_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s4, 0x3e9a209a ; SI-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -6203,10 +6270,10 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_mov_b32 s4, 0x800000 -; VI-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-NEXT: v_log_f32_e32 v0, v0 ; VI-NEXT: s_mov_b32 s4, 0x7f800000 ; VI-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 @@ -6230,10 +6297,10 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-NEXT: v_log_f32_e32 v0, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x3e9a209a ; GFX900-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 @@ -6255,20 +6322,22 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index c5dea7fd8b4b..2c5a9f58a199 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -22,9 +22,9 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 @@ -59,9 +59,9 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -98,9 +98,9 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] @@ -132,11 +132,12 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 @@ -215,24 +216,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v1 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s3, v3 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s2, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_log_f32_e32 v4, v1 ; SI-SDAG-NEXT: s_mov_b32 s4, s0 ; SI-SDAG-NEXT: s_mov_b32 s5, s1 -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v2 -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; @@ -265,21 +267,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 -; VI-SDAG-NEXT: v_log_f32_e32 v2, v1 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v4, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -313,21 +316,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 -; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v1 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v1 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 ; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; @@ -362,13 +366,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s4 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s5 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s3, v1 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, s2, v3 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s3, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 @@ -469,28 +476,30 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v4, s1, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s0, v6 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s1, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, s2, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s0, v5 +; SI-SDAG-NEXT: v_log_f32_e32 v7, v0 +; SI-SDAG-NEXT: v_log_f32_e32 v5, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc ; SI-SDAG-NEXT: s_mov_b32 s6, -1 -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v2 -; SI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v7 -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v6, v5 +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v7, v6 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v5, v4 ; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 ; SI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm @@ -533,27 +542,29 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, s1, v6 -; VI-SDAG-NEXT: v_log_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; VI-SDAG-NEXT: v_ldexp_f32 v5, s1, v5 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v6, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v5, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v6, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -595,27 +606,29 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s1, v6 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 -; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 -; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; GFX900-SDAG-NEXT: v_ldexp_f32 v5, s1, v5 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v5 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v1 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v5, v4 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v6, v0 ; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; @@ -654,28 +667,35 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: s_clause 0x1 ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s6 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s2, v2 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s1, v4 :: v_dual_mul_f32 v5, s0, v5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v4, 5, v4 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v4, s1, v4 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v5, s0, v5 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v1, v4, v1 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s2, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 ; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[4:5] ; GFX1100-SDAG-NEXT: s_endpgm @@ -806,34 +826,37 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; SI-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v8, s5, v8 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 -; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 -; SI-SDAG-NEXT: v_log_f32_e32 v8, v8 -; SI-SDAG-NEXT: v_log_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 -; SI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2 -; SI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 -; SI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7 -; SI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v7, 5, v7 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v3, s7, v3 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v5, s6, v5 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v7, s5, v7 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_log_f32_e32 v5, v5 +; SI-SDAG-NEXT: v_log_f32_e32 v7, v7 +; SI-SDAG-NEXT: v_log_f32_e32 v8, v1 +; SI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 +; SI-SDAG-NEXT: v_sub_f32_e32 v2, v5, v4 +; SI-SDAG-NEXT: v_sub_f32_e32 v1, v7, v6 +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v8, v0 ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; @@ -880,33 +903,36 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; VI-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, s2, v6 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v5, 5, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 -; VI-SDAG-NEXT: v_log_f32_e32 v6, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v8, s1, v8 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v8, v8 -; VI-SDAG-NEXT: v_log_f32_e32 v9, v1 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; VI-SDAG-NEXT: v_ldexp_f32 v5, s2, v5 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v7, 5, v7 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 +; VI-SDAG-NEXT: v_ldexp_f32 v7, s1, v7 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v7, v7 +; VI-SDAG-NEXT: v_log_f32_e32 v8, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v5, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7 -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, v7, v6 +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v8, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm @@ -954,33 +980,36 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v5, s3, v5 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v7, s2, v7 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v9, s1, v9 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 -; GFX900-SDAG-NEXT: v_log_f32_e32 v5, v5 -; GFX900-SDAG-NEXT: v_log_f32_e32 v7, v7 -; GFX900-SDAG-NEXT: v_log_f32_e32 v9, v9 -; GFX900-SDAG-NEXT: v_log_f32_e32 v10, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v7, v6 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v9, v8 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v10, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6 +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v8, 5, v8 +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 +; GFX900-SDAG-NEXT: v_ldexp_f32 v6, s2, v6 +; GFX900-SDAG-NEXT: v_ldexp_f32 v8, s1, v8 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v3, v3 +; GFX900-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX900-SDAG-NEXT: v_log_f32_e32 v8, v8 +; GFX900-SDAG-NEXT: v_log_f32_e32 v9, v1 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7 +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0 ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-SDAG-NEXT: s_endpgm ; @@ -1025,34 +1054,42 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG-NEXT: s_clause 0x1 ; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s1 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s6 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s3, v2 :: v_dual_mul_f32 v3, s2, v3 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s1, v6 :: v_dual_mul_f32 v7, s0, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s7 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, s9 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s6 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v7, 5, v7 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v2, s3, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v7, s0, v7 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v6, 5, v6 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0 -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v2, v0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v6, s1, v6 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v8, v1 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v7, v5 +; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v1, v6, v4 ; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[4:5] ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -1192,19 +1229,19 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) } define float @v_log2_f32(float %in) { -; GFX689-SDAG-LABEL: v_log2_f32: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32: ; GFX689-GISEL: ; %bb.0: @@ -1220,14 +1257,44 @@ define float @v_log2_f32(float %in) { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1260,19 +1327,19 @@ define float @v_log2_f32(float %in) { } define float @v_log2_fabs_f32(float %in) { -; GFX689-SDAG-LABEL: v_log2_fabs_f32: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_fabs_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_fabs_f32: ; GFX689-GISEL: ; %bb.0: @@ -1288,15 +1355,44 @@ define float @v_log2_fabs_f32(float %in) { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_fabs_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_fabs_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_fabs_f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1331,19 +1427,19 @@ define float @v_log2_fabs_f32(float %in) { } define float @v_log2_fneg_fabs_f32(float %in) { -; GFX689-SDAG-LABEL: v_log2_fneg_fabs_f32: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_fneg_fabs_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -|v0|, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_fneg_fabs_f32: ; GFX689-GISEL: ; %bb.0: @@ -1359,15 +1455,44 @@ define float @v_log2_fneg_fabs_f32(float %in) { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_fneg_fabs_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_fneg_fabs_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_fneg_fabs_f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e64 s0, 0x80800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -|v0|, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -|v0|, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1403,19 +1528,19 @@ define float @v_log2_fneg_fabs_f32(float %in) { } define float @v_log2_fneg_f32(float %in) { -; GFX689-SDAG-LABEL: v_log2_fneg_f32: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x80800000 -; GFX689-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_fneg_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, -v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_fneg_f32: ; GFX689-GISEL: ; %bb.0: @@ -1431,14 +1556,44 @@ define float @v_log2_fneg_f32(float %in) { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_fneg_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, -v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_fneg_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x80800000 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, -v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_fneg_f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, -v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1473,19 +1628,19 @@ define float @v_log2_fneg_f32(float %in) { } define float @v_log2_f32_fast(float %in) { -; GFX689-SDAG-LABEL: v_log2_f32_fast: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_fast: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_fast: ; GFX689-GISEL: ; %bb.0: @@ -1501,14 +1656,44 @@ define float @v_log2_f32_fast(float %in) { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_fast: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_fast: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_fast: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1541,19 +1726,19 @@ define float @v_log2_f32_fast(float %in) { } define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { -; GFX689-SDAG-LABEL: v_log2_f32_unsafe_math_attr: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_unsafe_math_attr: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_unsafe_math_attr: ; GFX689-GISEL: ; %bb.0: @@ -1569,14 +1754,44 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_unsafe_math_attr: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_unsafe_math_attr: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_unsafe_math_attr: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1609,19 +1824,19 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { } define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { -; GFX689-SDAG-LABEL: v_log2_f32_approx_fn_attr: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_approx_fn_attr: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_approx_fn_attr: ; GFX689-GISEL: ; %bb.0: @@ -1637,14 +1852,44 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_approx_fn_attr: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_approx_fn_attr: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_approx_fn_attr: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1677,19 +1922,19 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" } define float @v_log2_f32_ninf(float %in) { -; GFX689-SDAG-LABEL: v_log2_f32_ninf: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_ninf: ; GFX689-GISEL: ; %bb.0: @@ -1705,14 +1950,44 @@ define float @v_log2_f32_ninf(float %in) { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_ninf: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_ninf: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_ninf: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1745,19 +2020,19 @@ define float @v_log2_f32_ninf(float %in) { } define float @v_log2_f32_afn(float %in) { -; GFX689-SDAG-LABEL: v_log2_f32_afn: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_afn: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_afn: ; GFX689-GISEL: ; %bb.0: @@ -1773,14 +2048,44 @@ define float @v_log2_f32_afn(float %in) { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_afn: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_afn: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1839,19 +2144,19 @@ define float @v_log2_f32_afn_daz(float %in) #0 { } define float @v_log2_f32_afn_dynamic(float %in) #1 { -; GFX689-SDAG-LABEL: v_log2_f32_afn_dynamic: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_afn_dynamic: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_afn_dynamic: ; GFX689-GISEL: ; %bb.0: @@ -1867,14 +2172,44 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_afn_dynamic: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_afn_dynamic: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_afn_dynamic: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1907,19 +2242,19 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 { } define float @v_fabs_log2_f32_afn(float %in) { -; GFX689-SDAG-LABEL: v_fabs_log2_f32_afn: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_fabs_log2_f32_afn: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_fabs_log2_f32_afn: ; GFX689-GISEL: ; %bb.0: @@ -1935,15 +2270,44 @@ define float @v_fabs_log2_f32_afn(float %in) { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_fabs_log2_f32_afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_fabs_log2_f32_afn: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_fabs_log2_f32_afn: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_mul_f32_e64 v0, |v0|, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2004,19 +2368,19 @@ define float @v_log2_f32_daz(float %in) #0 { } define float @v_log2_f32_nnan(float %in) { -; GFX689-SDAG-LABEL: v_log2_f32_nnan: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_nnan: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_nnan: ; GFX689-GISEL: ; %bb.0: @@ -2032,14 +2396,44 @@ define float @v_log2_f32_nnan(float %in) { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_nnan: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_nnan: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_nnan: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2098,19 +2492,19 @@ define float @v_log2_f32_nnan_daz(float %in) #0 { } define float @v_log2_f32_nnan_dynamic(float %in) #1 { -; GFX689-SDAG-LABEL: v_log2_f32_nnan_dynamic: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_nnan_dynamic: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_nnan_dynamic: ; GFX689-GISEL: ; %bb.0: @@ -2126,14 +2520,44 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_nnan_dynamic: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_nnan_dynamic: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_dynamic: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2192,19 +2616,19 @@ define float @v_log2_f32_ninf_daz(float %in) #0 { } define float @v_log2_f32_ninf_dynamic(float %in) #1 { -; GFX689-SDAG-LABEL: v_log2_f32_ninf_dynamic: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_ninf_dynamic: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_ninf_dynamic: ; GFX689-GISEL: ; %bb.0: @@ -2220,14 +2644,44 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_ninf_dynamic: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_ninf_dynamic: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_ninf_dynamic: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2260,19 +2714,19 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 { } define float @v_log2_f32_nnan_ninf(float %in) { -; GFX689-SDAG-LABEL: v_log2_f32_nnan_ninf: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_nnan_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf: ; GFX689-GISEL: ; %bb.0: @@ -2288,14 +2742,44 @@ define float @v_log2_f32_nnan_ninf(float %in) { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_nnan_ninf: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_nnan_ninf: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2354,19 +2838,19 @@ define float @v_log2_f32_nnan_ninf_daz(float %in) #0 { } define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 { -; GFX689-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic: ; GFX689-GISEL: ; %bb.0: @@ -2382,14 +2866,44 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2448,19 +2962,19 @@ define float @v_log2_f32_fast_daz(float %in) #0 { } define float @v_log2_f32_dynamic_mode(float %in) #1 { -; GFX689-SDAG-LABEL: v_log2_f32_dynamic_mode: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX689-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX689-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX689-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] +; SI-SDAG-LABEL: v_log2_f32_dynamic_mode: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX689-GISEL-LABEL: v_log2_f32_dynamic_mode: ; GFX689-GISEL: ; %bb.0: @@ -2476,14 +2990,44 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { ; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; VI-SDAG-LABEL: v_log2_f32_dynamic_mode: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-SDAG-LABEL: v_log2_f32_dynamic_mode: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_dynamic_mode: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2649,10 +3193,10 @@ define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -2718,9 +3262,9 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, 0x800000 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; SI-NEXT: v_log_f32_e32 v0, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -2732,10 +3276,10 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_mov_b32 s4, 0x800000 -; VI-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-NEXT: v_log_f32_e32 v0, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -2748,9 +3292,9 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: s_mov_b32 s4, 0x800000 ; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX900-NEXT: v_log_f32_e32 v0, v0 ; GFX900-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -2763,10 +3307,11 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX1100-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll index ca7f56d9ff34..33e34e38a183 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX678,GFX6 %s -; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX678,GFX7 %s -; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX678,GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX678,GFX6 %s +; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX678,GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX678,GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s declare void @llvm.set.rounding(i32) declare i32 @llvm.get.rounding() diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-addc0.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-addc0.mir index 0f22be796845..37a77d658ffb 100644 --- a/llvm/test/CodeGen/AMDGPU/load-store-opt-addc0.mir +++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-addc0.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-load-store-opt -o - %s | FileCheck --check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-load-store-opt -o - %s | FileCheck --check-prefix=GCN %s # This used to crash diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index 13932b39ac1a..2a7553ae5d92 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -187,6 +187,7 @@ define amdgpu_kernel void @k01() { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: s_endpgm + call void @f0() call void @f1() ret void diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 33007e5b285d..3be17f9538d0 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -1333,5 +1333,668 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 { ret i48 %a } +define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 { +; CI-LABEL: lshr_mad_i64_1: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_movk_i32 s4, 0xfc19 +; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_1: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_movk_i32 s4, 0xfc19 +; SI-NEXT: v_mul_hi_u32 v2, v1, s4 +; SI-NEXT: v_mul_lo_u32 v3, v1, s4 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0xfc19 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, s0xfffffffffffffc19 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define i64 @lshr_mad_i64_2(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_2: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_movk_i32 s4, 0xd1 +; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_2: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_movk_i32 s4, 0xd1 +; SI-NEXT: v_mul_hi_u32 v2, v1, s4 +; SI-NEXT: v_mul_lo_u32 v3, v1, s4 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0xd1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, s0xffffffff000000d1 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define i64 @lshr_mad_i64_3(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_3: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_movk_i32 s4, 0xfc88 +; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_3: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_movk_i32 s4, 0xfc88 +; SI-NEXT: v_mul_hi_u32 v2, v1, s4 +; SI-NEXT: v_mul_lo_u32 v3, v1, s4 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0xfc88 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 s0xfffffffffffffc88, %lsh + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { +; CI-LABEL: lshr_mad_i64_4: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mul_lo_u32 v3, v2, v0 +; CI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v1, v0, 0 +; CI-NEXT: s_movk_i32 s4, 0xfc88 +; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[1:2] +; CI-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_4: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mul_lo_u32 v2, v2, v0 +; SI-NEXT: v_mul_hi_u32 v3, v1, v0 +; SI-NEXT: s_movk_i32 s4, 0xfc88 +; SI-NEXT: v_mul_lo_u32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; SI-NEXT: v_mul_hi_u32 v3, v2, s4 +; SI-NEXT: v_mul_lo_u32 v1, v2, s4 +; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v0, v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-NEXT: s_movk_i32 s4, 0xfc88 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[4:5] +; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_4: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v2, v0, v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %ext = zext i32 %arg0 to i64 + %mul1 = mul i64 %arg1, %ext + %lsh = lshr i64 %mul1, 32 + %mul2 = mul i64 %lsh, s0xfffffffffffffc88 + %mad = add i64 %mul2, %mul1 + ret i64 %mad +} + +define i64 @lshr_mad_i64_negative_1(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_negative_1: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; CI-NEXT: s_movk_i32 s4, 0xfc19 +; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1] +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_negative_1: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; SI-NEXT: s_movk_i32 s4, 0xfc19 +; SI-NEXT: v_mul_lo_u32 v3, v2, s4 +; SI-NEXT: v_mul_hi_i32 v2, v2, s4 +; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_negative_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; GFX9-NEXT: s_movk_i32 s4, 0xfc19 +; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-LABEL: lshr_mad_i64_negative_1: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_i64_i32 v[2:3], null, 0xfffffc19, v4, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1150-LABEL: lshr_mad_i64_negative_1: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1] +; GFX1150-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_negative_1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 36 + %mul = mul i64 %lsh, s0xfffffffffffffc19 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_negative_2: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_movk_i32 s4, 0xd1 +; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; CI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v0 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_negative_2: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_movk_i32 s4, 0xd1 +; SI-NEXT: v_mul_hi_u32 v2, v1, s4 +; SI-NEXT: v_mul_lo_u32 v4, v1, s4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v1 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_negative_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0xd1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_negative_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_negative_2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, s0xffffff00000000d1 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_negative_3: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshr_b64 v[2:3], v[0:1], 22 +; CI-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 +; CI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_negative_3: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 22 +; SI-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 +; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_negative_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_negative_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 +; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_negative_3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %op = add i64 %arg0, 1 + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, s0xfffffffffffffc00 + %mad = add i64 %mul, %op + + ret i64 %mad +} + +define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_negative_4: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1] +; CI-NEXT: v_mul_lo_u32 v0, v1, v1 +; CI-NEXT: v_add_i32_e32 v1, vcc, v0, v3 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_negative_4: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mul_hi_u32 v2, v1, v0 +; SI-NEXT: v_mul_lo_u32 v3, v1, v1 +; SI-NEXT: v_mul_lo_u32 v4, v1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_negative_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v1, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-LABEL: lshr_mad_i64_negative_4: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1150-LABEL: lshr_mad_i64_negative_4: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mov_b32_e32 v0, v4 +; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, v3 +; GFX1150-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_negative_4: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, %arg0 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 { +; CI-LABEL: lshr_mad_i64_sgpr: +; CI: ; %bb.0: +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v2, 0xffff1c18 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s1, v2, v[0:1] +; CI-NEXT: v_subrev_i32_e32 v1, vcc, s1, v1 +; CI-NEXT: v_readfirstlane_b32 s0, v0 +; CI-NEXT: v_readfirstlane_b32 s1, v1 +; CI-NEXT: ; return to shader part epilog +; +; SI-LABEL: lshr_mad_i64_sgpr: +; SI: ; %bb.0: +; SI-NEXT: v_mov_b32_e32 v0, 0xffff1c18 +; SI-NEXT: v_mul_hi_u32 v0, s1, v0 +; SI-NEXT: s_mul_i32 s2, s1, 0xffff1c18 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_sub_i32 s3, s3, s1 +; SI-NEXT: s_add_u32 s0, s2, s0 +; SI-NEXT: s_addc_u32 s1, s3, s1 +; SI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: lshr_mad_i64_sgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mul_hi_u32 s2, s1, 0xffff1c18 +; GFX9-NEXT: s_sub_i32 s2, s2, s1 +; GFX9-NEXT: s_mul_i32 s3, s1, 0xffff1c18 +; GFX9-NEXT: s_add_u32 s0, s3, s0 +; GFX9-NEXT: s_addc_u32 s1, s2, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: lshr_mad_i64_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mul_hi_u32 s2, s1, 0xffff1c18 +; GFX11-NEXT: s_mul_i32 s3, s1, 0xffff1c18 +; GFX11-NEXT: s_sub_i32 s2, s2, s1 +; GFX11-NEXT: s_add_u32 s0, s3, s0 +; GFX11-NEXT: s_addc_u32 s1, s2, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: lshr_mad_i64_sgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s4, 0xffff1c18 +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_mov_b32 s5, -1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-NEXT: ; return to shader part epilog + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, s0xffffffffffff1c18 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 { +; CI-LABEL: lshr_mad_i64_vec: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, 0xffff1c18 +; CI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1] +; CI-NEXT: s_mov_b32 s4, 0xffff1118 +; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3] +; CI-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 +; CI-NEXT: v_mov_b32_e32 v0, v4 +; CI-NEXT: v_mov_b32_e32 v2, v6 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_vec: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0xffff1118 +; SI-NEXT: v_mul_lo_u32 v4, v3, s4 +; SI-NEXT: v_mul_hi_u32 v5, v3, s4 +; SI-NEXT: s_mov_b32 s4, 0xffff1c18 +; SI-NEXT: v_mul_hi_u32 v6, v1, s4 +; SI-NEXT: v_mul_lo_u32 v7, v1, s4 +; SI-NEXT: v_sub_i32_e32 v5, vcc, v5, v3 +; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_vec: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xffff1c18 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1] +; GFX9-NEXT: s_mov_b32 s4, 0xffff1118 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3] +; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_vec: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, v7, v3 +; GFX11-NEXT: v_mov_b32_e32 v2, v6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_vec: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3 +; GFX12-NEXT: v_mov_b32_e32 v2, v6 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr <2 x i64> %arg0, <i64 32, i64 32> + %mul = mul <2 x i64> %lsh, <i64 s0xffffffffffff1c18, i64 s0xffffffffffff1118> + %mad = add <2 x i64> %mul, %arg0 + + ret <2 x i64> %mad +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone speculatable } diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir index 8be7308c8a6e..3feccff715bc 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir @@ -746,7 +746,7 @@ name: smfma4x4_write_vgpr_dot_write body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec - $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec ... # GCN-LABEL: name: smfma4x4_read_srcc_vgpr_valu_write # GCN: V_MFMA @@ -945,7 +945,7 @@ name: dot_write_vgpr_different_dot_read_srcc body: | bb.0: $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec - $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec ... # GCN-LABEL: name: dot_write_vgpr_different_dot_write # GCN: V_DOT @@ -955,7 +955,7 @@ name: dot_write_vgpr_different_dot_write body: | bb.0: $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec - $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec ... # GCN-LABEL: name: dot_write_vgpr_different_valu_read # GCN: V_DOT diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir index d59bcfb16eec..52891989b88f 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir @@ -1071,7 +1071,7 @@ name: xdl_smfma4x4_write_vgpr_dot_write body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_I32_4X4X4I8_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec - $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec ... # GCN-LABEL: name: nonxdl_smfma4x4_read_srcc_vgpr_valu_write # GCN: V_MFMA @@ -1265,7 +1265,7 @@ name: dot_write_vgpr_different_dot_read_srcc body: | bb.0: $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec - $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $mode, implicit $exec ... # GCN-LABEL: name: dot_write_vgpr_different_dot_write # GCN: V_DOT @@ -1275,7 +1275,7 @@ name: dot_write_vgpr_different_dot_write body: | bb.0: $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec - $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec ... # GCN-LABEL: name: dot_write_vgpr_different_valu_read # GCN: V_DOT diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir index f68b84c7140b..433236180b13 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -march=amdgcn -mcpu=gfx950 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s # Immediate operand order = cbsz, abid, blgp diff --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll index 7f0f473c11bd..0460f83b5773 100644 --- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll +++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll @@ -1,14 +1,15 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11PLUS %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11PLUS %s ; SPI_TMPRING_SIZE.WAVESIZE = 5 ; GFX10: .long 165608 ; GFX10-NEXT: .long 20480 ; SPI_TMPRING_SIZE.WAVESIZE = 17 -; GFX11: .long 165608 -; GFX11-NEXT: .long 69632 +; GFX11PLUS: .long 165608 +; GFX11PLUS-NEXT: .long 69632 ; GCN-LABEL: {{^}}scratch_ps: ; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll new file mode 100644 index 000000000000..0f67a404972a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll @@ -0,0 +1,160 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %descTable1, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 { +; GFX11-LABEL: mixed_vmem_types: +; GFX11: ; %bb.0: ; %.entry +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_mov_b32 s0, s3 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 +; GFX11-NEXT: image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX11-NEXT: buffer_load_b32 v4, off, s[40:43], 0 +; GFX11-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cmp_eq_f32_e64 s1, 1.0, v3 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 +; GFX11-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[24:27], 0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: mixed_vmem_types: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_mov_b32 s0, s3 +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 +; GFX12-NEXT: s_mov_b32 s3, s5 +; GFX12-NEXT: s_mov_b32 s1, s5 +; GFX12-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 +; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX12-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[20:23], null +; GFX12-NEXT: buffer_load_b32 v2, off, s[16:19], null +; GFX12-NEXT: image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-NEXT: buffer_load_b32 v4, off, s[40:43], null +; GFX12-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 +; GFX12-NEXT: s_wait_samplecnt 0x1 +; GFX12-NEXT: v_cmp_eq_f32_e64 s1, 1.0, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 +; GFX12-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_and_b32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, s0, s2 +; GFX12-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-NEXT: buffer_store_b32 v0, off, s[24:27], null +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mixed_vmem_types: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_getpc_b64 s[20:21] +; GFX12-GISEL-NEXT: s_mov_b32 s0, s3 +; GFX12-GISEL-NEXT: s_sext_i32_i16 s21, s21 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 +; GFX12-GISEL-NEXT: s_mov_b32 s1, s21 +; GFX12-GISEL-NEXT: s_mov_b32 s3, s21 +; GFX12-GISEL-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 +; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null +; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null +; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null +; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2 +; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1 +; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1 +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1 +; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3 +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 +; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX12-GISEL-NEXT: s_and_b32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_and_b32 s0, s0, s2 +; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-GISEL-NEXT: buffer_store_b32 v0, off, s[24:27], null +; GFX12-GISEL-NEXT: s_endpgm +.entry: + %i = call i64 @llvm.amdgcn.s.getpc() + %extelt.offset = lshr i64 %i, 32 + %.i1 = trunc i64 %extelt.offset to i32 + %.upto0 = insertelement <2 x i32> poison, i32 %descTable1, i64 0 + %i1 = insertelement <2 x i32> %.upto0, i32 %.i1, i64 1 + %i2 = bitcast <2 x i32> %i1 to i64 + %i3 = inttoptr i64 %i2 to ptr addrspace(4) + %.upto03 = insertelement <2 x i32> poison, i32 %descTable0, i64 0 + %i4 = insertelement <2 x i32> %.upto03, i32 %.i1, i64 1 + %i5 = bitcast <2 x i32> %i4 to i64 + %i6 = inttoptr i64 %i5 to ptr addrspace(4) + %i7 = getelementptr i8, ptr addrspace(4) %i6, i64 80 + %i8 = load <4 x i32>, ptr addrspace(4) %i7, align 16 + %i9 = getelementptr i8, ptr addrspace(4) %i3, i64 48 + %i10 = load <4 x i32>, ptr addrspace(4) %i9, align 16 + %i11 = getelementptr i8, ptr addrspace(4) %i6, i64 64 + %i12 = load <4 x i32>, ptr addrspace(4) %i11, align 16 + %i13 = getelementptr i8, ptr addrspace(4) %i6, i64 16 + %i14 = load <4 x i32>, ptr addrspace(4) %i13, align 16 + %i15 = getelementptr i8, ptr addrspace(4) %i6, i64 32 + %i16 = load <8 x i32>, ptr addrspace(4) %i15, align 32 + %i17 = load <4 x i32>, ptr addrspace(4) %i6, align 16 + %i18 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i16, <4 x i32> %i17, i1 false, i32 0, i32 0) + %i19 = fcmp oeq float %i18, 0.000000e+00 + %i20 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i14, i32 0, i32 0, i32 0) + %.not = icmp eq i32 %i20, 2752 + %i21 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i12, i32 0, i32 0, i32 0) + %.not1 = icmp eq i32 %i21, 2752 + %i22 = getelementptr i8, ptr addrspace(4) %i3, i64 16 + %i23 = load <8 x i32>, ptr addrspace(4) %i22, align 32 + %i24 = load <4 x i32>, ptr addrspace(4) %i3, align 16 + %i25 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i23, <4 x i32> %i24, i1 false, i32 0, i32 0) + %i26 = fcmp oeq float %i25, 1.000000e+00 + %i27 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i10, i32 0, i32 0, i32 0) + %.not2 = icmp eq i32 %i27, 2752 + %i28 = select i1 %.not2, i1 %i26, i1 false + %i29 = select i1 %i28, i1 %.not1, i1 false + %i30 = select i1 %i29, i1 %.not, i1 false + %narrow2 = select i1 %i30, i1 %i19, i1 false + %.4 = zext i1 %narrow2 to i32 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %.4, <4 x i32> %i8, i32 0, i32 0, i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mmra.ll b/llvm/test/CodeGen/AMDGPU/mmra.ll index 0167fcbc4ab7..39650f4295c7 100644 --- a/llvm/test/CodeGen/AMDGPU/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/mmra.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s declare void @readsMem(ptr) #0 declare void @writesMem(ptr) #1 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 85096eb63f46..2bd60e869f84 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -30,15 +30,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: s_cmp_lg_u32 s9, 0 ; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3 ; MUBUF-NEXT: ; %bb.2: ; %bb.1 -; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 -; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 -; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: s_mov_b32 s6, s32 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 -; MUBUF-NEXT: v_mov_b32_e32 v2, s6 -; MUBUF-NEXT: v_mov_b32_e32 v3, 1 +; MUBUF-NEXT: v_mov_b32_e32 v2, 1 +; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 +; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6 +; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4 ; MUBUF-NEXT: s_add_i32 s6, s6, s7 -; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 @@ -66,11 +65,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 -; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 +; FLATSCR-NEXT: s_mov_b32 s2, s32 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 ; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2 -; FLATSCR-NEXT: s_mov_b32 s32, s2 +; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 ; FLATSCR-NEXT: s_add_i32 s2, s2, s3 ; FLATSCR-NEXT: scratch_load_dword v2, off, s2 @@ -131,10 +130,10 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: s_cmp_lg_u32 s4, 0 ; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2 ; MUBUF-NEXT: ; %bb.1: ; %bb.0 -; MUBUF-NEXT: s_add_i32 s4, s32, 0x1000 +; MUBUF-NEXT: s_add_i32 s4, s32, 0xfff ; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000 ; MUBUF-NEXT: s_lshl_b32 s5, s5, 2 -; MUBUF-NEXT: s_mov_b32 s32, s4 +; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, s4 ; MUBUF-NEXT: v_mov_b32_e32 v3, 1 @@ -165,12 +164,12 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 -; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1000 +; FLATSCR-NEXT: s_add_i32 s0, s32, 0xfff ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 ; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2 -; FLATSCR-NEXT: s_mov_b32 s32, s0 +; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 ; FLATSCR-NEXT: s_add_i32 s0, s0, s1 ; FLATSCR-NEXT: scratch_load_dword v2, off, s0 @@ -230,16 +229,15 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; MUBUF-NEXT: s_and_b64 exec, exec, vcc ; MUBUF-NEXT: s_cbranch_execz .LBB2_3 ; MUBUF-NEXT: ; %bb.2: ; %bb.1 -; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_mov_b32 s6, s32 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 -; MUBUF-NEXT: v_mov_b32_e32 v3, s6 -; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; MUBUF-NEXT: v_mov_b32_e32 v2, 1 -; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4 ; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31 -; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 ; MUBUF-NEXT: global_store_dword v[0:1], v2, off @@ -266,14 +264,14 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; FLATSCR-NEXT: s_and_b64 exec, exec, vcc ; FLATSCR-NEXT: s_cbranch_execz .LBB2_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 -; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 +; FLATSCR-NEXT: s_mov_b32 s2, s32 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 1 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31 -; FLATSCR-NEXT: s_mov_b32 s32, s2 +; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off @@ -324,7 +322,7 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc ; MUBUF-NEXT: s_cbranch_execz .LBB3_2 ; MUBUF-NEXT: ; %bb.1: ; %bb.0 -; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_add_i32 s6, s32, 0xfff ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 ; MUBUF-NEXT: v_mov_b32_e32 v4, s6 @@ -334,7 +332,7 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31 -; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 ; MUBUF-NEXT: global_store_dword v[0:1], v2, off @@ -358,7 +356,7 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; FLATSCR-NEXT: s_cbranch_execz .LBB3_2 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 -; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 +; FLATSCR-NEXT: s_add_i32 s2, s32, 0xfff ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v4, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v5, 1 @@ -366,7 +364,7 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31 -; FLATSCR-NEXT: s_mov_b32 s32, s2 +; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 37d0309caac0..b40d35dbd8ac 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -20,8 +20,8 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) - ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: S_ENDPGM 0 ; ; PEI-GFX908-LABEL: name: partial_copy diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index 5b8acc31b22c..0c6d8dce193d 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -599,10 +599,8 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr ; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. ; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-NEXT: ; %bb.0: -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x1c +; GFX940-NEXT: s_lshr_b32 s0, s15, 16 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshr_b32 s0, s0, 16 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1 ; GFX940-NEXT: s_endpgm @@ -626,4 +624,74 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr ret void } +; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs. + +define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 { +; GFX940-LABEL: preload_block_max_user_sgprs: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, s12 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_block_max_user_sgprs: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x28 +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 { +; GFX940-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: +; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-NEXT: ; %bb.0: +; GFX940-NEXT: s_lshr_b32 s0, s9, 16 +; GFX940-NEXT: s_and_b32 s1, s8, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: +; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-NEXT: ; %bb.0: +; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 +; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff +; GFX90a-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NEXT: v_mov_b32_e32 v2, s0 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: s_endpgm + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 + %gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 + %gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load0 = load i32, ptr addrspace(4) %gep0 + %load1 = load i16, ptr addrspace(4) %gep1 + %load2 = load i16, ptr addrspace(4) %gep2 + %conv1 = zext i16 %load1 to i32 + %conv2 = zext i16 %load2 to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} + attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll index d070dc3b770f..1afd31c6d45e 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll @@ -98,7 +98,7 @@ define amdgpu_kernel void @kernel_128_512() #5 { define amdgpu_kernel void @kernel_512_512() #6 { ; CHECK-LABEL: define {{[^@]+}}@kernel_512_512 -; CHECK-SAME: () #[[ATTR5]] { +; CHECK-SAME: () #[[ATTR6:[0-9]+]] { ; CHECK-NEXT: call void @default_to_128_512() ; CHECK-NEXT: call void @flat_group_512_1024() ; CHECK-NEXT: ret void @@ -111,7 +111,7 @@ define amdgpu_kernel void @kernel_512_512() #6 { ; Called from kernels with 128,256 and 64,128 => 64,256 define internal void @default_to_64_256() { ; CHECK-LABEL: define {{[^@]+}}@default_to_64_256 -; CHECK-SAME: () #[[ATTR6:[0-9]+]] { +; CHECK-SAME: () #[[ATTR7:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -121,7 +121,7 @@ define internal void @default_to_64_256() { ; this should probably be illegal. define amdgpu_kernel void @kernel_128_256() #3 { ; CHECK-LABEL: define {{[^@]+}}@kernel_128_256 -; CHECK-SAME: () #[[ATTR7:[0-9]+]] { +; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @default_to_64_256() ; CHECK-NEXT: ret void ; @@ -153,7 +153,7 @@ define internal void @merge_cycle_1() #3 { define amdgpu_kernel void @kernel_64_256() #7 { ; CHECK-LABEL: define {{[^@]+}}@kernel_64_256 -; CHECK-SAME: () #[[ATTR6]] { +; CHECK-SAME: () #[[ATTR7]] { ; CHECK-NEXT: call void @merge_cycle_0() ; CHECK-NEXT: call void @default_captured_address() ; CHECK-NEXT: call void @externally_visible_default() @@ -188,7 +188,7 @@ define void @externally_visible_default() { ; 1,1024 -> 64,256 define internal i32 @bitcasted_function() { ; CHECK-LABEL: define {{[^@]+}}@bitcasted_function -; CHECK-SAME: () #[[ATTR6]] { +; CHECK-SAME: () #[[ATTR7]] { ; CHECK-NEXT: ret i32 0 ; ret i32 0 @@ -203,13 +203,13 @@ attributes #5 = { "amdgpu-flat-work-group-size"="128,512" } attributes #6 = { "amdgpu-flat-work-group-size"="512,512" } attributes #7 = { "amdgpu-flat-work-group-size"="64,256" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,1024" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index f62f1d57aec8..6a909f52082d 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -117,7 +117,7 @@ define amdgpu_kernel void @kernel_2_9() #6 { define amdgpu_kernel void @kernel_9_9() #7 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_9_9 -; CHECK-SAME: () #[[ATTR6]] { +; CHECK-SAME: () #[[ATTR7:[0-9]+]] { ; CHECK-NEXT: call void @default_to_2_9() ; CHECK-NEXT: call void @flat_group_9_10() ; CHECK-NEXT: ret void @@ -140,7 +140,7 @@ define internal void @default_to_1_8_b() { ; this should probably be illegal. define amdgpu_kernel void @kernel_2_8() #4 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2_8 -; CHECK-SAME: () #[[ATTR7:[0-9]+]] { +; CHECK-SAME: () #[[ATTR5]] { ; CHECK-NEXT: call void @default_to_1_8_a() ; CHECK-NEXT: call void @default_to_1_8_b() ; CHECK-NEXT: ret void @@ -153,7 +153,7 @@ define amdgpu_kernel void @kernel_2_8() #4 { ; 1,2 -> 2,2 define internal void @merge_cycle_0() #1 { ; CHECK-LABEL: define internal void @merge_cycle_0 -; CHECK-SAME: () #[[ATTR5]] { +; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: call void @merge_cycle_1() ; CHECK-NEXT: ret void ; @@ -165,7 +165,7 @@ define internal void @merge_cycle_0() #1 { ; 2,8 -> 2,8 define internal void @merge_cycle_1() #4 { ; CHECK-LABEL: define internal void @merge_cycle_1 -; CHECK-SAME: () #[[ATTR7]] { +; CHECK-SAME: () #[[ATTR5]] { ; CHECK-NEXT: call void @merge_cycle_0() ; CHECK-NEXT: ret void ; @@ -235,7 +235,7 @@ define internal void @called_from_invalid_bounds_1() { ; Invalid range for amdgpu-waves-per-eu define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8 -; CHECK-SAME: () #[[ATTR11:[0-9]+]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: call void @called_from_invalid_bounds_0() ; CHECK-NEXT: ret void ; @@ -246,7 +246,7 @@ define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 { ; Invalid range for amdgpu-waves-per-eu define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123 -; CHECK-SAME: () #[[ATTR12:[0-9]+]] { +; CHECK-SAME: () #[[ATTR11:[0-9]+]] { ; CHECK-NEXT: call void @called_from_invalid_bounds_1() ; CHECK-NEXT: ret void ; @@ -260,7 +260,7 @@ define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 { ; -> 2,10 define void @larger_group_size_implies_lower_minimum() #11 { ; CHECK-LABEL: define void @larger_group_size_implies_lower_minimum -; CHECK-SAME: () #[[ATTR13:[0-9]+]] { +; CHECK-SAME: () #[[ATTR12:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -268,7 +268,7 @@ define void @larger_group_size_implies_lower_minimum() #11 { define amdgpu_kernel void @kernel_3_6() #12 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_3_6 -; CHECK-SAME: () #[[ATTR14:[0-9]+]] { +; CHECK-SAME: () #[[ATTR13:[0-9]+]] { ; CHECK-NEXT: call void @larger_group_size_implies_lower_minimum() ; CHECK-NEXT: ret void ; @@ -279,7 +279,7 @@ define amdgpu_kernel void @kernel_3_6() #12 { ; 3,6 -> 6,9 define internal void @refine_upper_func_3_6() #13 { ; CHECK-LABEL: define internal void @refine_upper_func_3_6 -; CHECK-SAME: () #[[ATTR15:[0-9]+]] { +; CHECK-SAME: () #[[ATTR14:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -288,7 +288,7 @@ define internal void @refine_upper_func_3_6() #13 { ; 4,8 -> 6,8 define internal void @refine_lower_func_4_8() #14 { ; CHECK-LABEL: define internal void @refine_lower_func_4_8 -; CHECK-SAME: () #[[ATTR16:[0-9]+]] { +; CHECK-SAME: () #[[ATTR15:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: ret void ; @@ -298,7 +298,7 @@ define internal void @refine_lower_func_4_8() #14 { define amdgpu_kernel void @kernel_foo_6_8() #15 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_foo_6_8 -; CHECK-SAME: () #[[ATTR16]] { +; CHECK-SAME: () #[[ATTR16:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: call void @refine_lower_func_4_8() ; CHECK-NEXT: call void @func_9_10_a() @@ -340,7 +340,7 @@ define internal void @func_9_10_a() #18 { ; 9,10 -> 9,9 define internal void @func_9_10_b() #18 { ; CHECK-LABEL: define internal void @func_9_10_b -; CHECK-SAME: () #[[ATTR20:[0-9]+]] { +; CHECK-SAME: () #[[ATTR19]] { ; CHECK-NEXT: ret void ; ret void @@ -348,7 +348,7 @@ define internal void @func_9_10_b() #18 { define amdgpu_kernel void @kernel_bar_8_9() #19 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_bar_8_9 -; CHECK-SAME: () #[[ATTR21:[0-9]+]] { +; CHECK-SAME: () #[[ATTR20:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: call void @func_5_5() ; CHECK-NEXT: call void @func_9_10_b() @@ -399,26 +399,25 @@ attributes #17 = { "amdgpu-waves-per-eu"="5,8" } attributes #18 = { "amdgpu-waves-per-eu"="9,10" } attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR14]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll index 6b097bd71c9f..ba428df273db 100644 --- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll +++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll @@ -3,20 +3,32 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_cs float @v_s_exp_f32(float inreg %src) { -; GFX12-LABEL: v_s_exp_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000 -; GFX12-NEXT: s_cselect_b32 s1, 0x42800000, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) -; GFX12-NEXT: s_add_f32 s0, s0, s1 -; GFX12-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0 -; GFX12-NEXT: v_s_exp_f32 s0, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) -; GFX12-NEXT: s_mul_f32 s0, s0, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: v_s_exp_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000 +; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42800000, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-SDAG-NEXT: s_add_f32 s0, s0, s1 +; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0xffffffc0, 0 +; GFX12-SDAG-NEXT: v_s_exp_f32 s0, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX12-SDAG-NEXT: v_ldexp_f32 v0, s0, s1 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: v_s_exp_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42800000, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: s_add_f32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0 +; GFX12-GISEL-NEXT: v_s_exp_f32 s0, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.exp2.f32(float %src) ret float %result } @@ -55,20 +67,38 @@ define amdgpu_cs half @v_s_amdgcn_exp_f16(half inreg %src) { } define amdgpu_cs float @v_s_log_f32(float inreg %src) { -; GFX12-LABEL: v_s_log_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_cmp_lt_f32 s0, 0x800000 -; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) -; GFX12-NEXT: s_mul_f32 s0, s0, s1 -; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0 -; GFX12-NEXT: v_s_log_f32 s0, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) -; GFX12-NEXT: s_sub_f32 s0, s0, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: v_s_log_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0x800000 +; GFX12-SDAG-NEXT: s_cselect_b32 s1, -1, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo +; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0 +; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX12-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: v_s_log_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0 +; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.log2.f32(float %src) ret float %result } @@ -271,22 +301,41 @@ define amdgpu_cs half @v_amdgcn_sqrt_f16(half inreg %src) { } define amdgpu_cs float @srcmods_abs_f32(float inreg %src) { -; GFX12-LABEL: srcmods_abs_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_bitset0_b32 s0, 31 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_lt_f32 s0, 0x800000 -; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0 -; GFX12-NEXT: s_mul_f32 s0, s0, s1 -; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX12-NEXT: v_s_log_f32 s0, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_sub_f32 s0, s0, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-SDAG-LABEL: srcmods_abs_f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_and_b32 s1, s0, 0x7fffffff +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_cmp_lt_f32 s1, 0x800000 +; GFX12-SDAG-NEXT: s_cselect_b32 s1, -1, 0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX12-SDAG-NEXT: v_ldexp_f32 v0, |s0|, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo +; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: srcmods_abs_f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_bitset0_b32 s0, 31 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0 +; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: ; return to shader part epilog %abs = call float @llvm.fabs.f32(float %src) %result = call float @llvm.log2.f32(float %abs) ret float %result @@ -295,18 +344,20 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) { define amdgpu_cs float @srcmods_neg_f32(float inreg %src) { ; GFX12-SDAG-LABEL: srcmods_neg_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_xor_b32 s1, s0, 0x80000000 ; GFX12-SDAG-NEXT: s_cmp_gt_f32 s0, 0x80800000 -; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) -; GFX12-SDAG-NEXT: s_mul_f32 s0, s1, s0 -; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0 -; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0 +; GFX12-SDAG-NEXT: s_cselect_b32 s1, -1, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_ldexp_f32 v0, -s0, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) -; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-SDAG-NEXT: s_and_b32 s0, s1, exec_lo +; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x42000000, 0 +; GFX12-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX12-SDAG-NEXT: v_subrev_f32_e32 v0, s0, v0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: srcmods_neg_f32: diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-registers-error-all-regs-reserved.ll b/llvm/test/CodeGen/AMDGPU/ran-out-of-registers-error-all-regs-reserved.ll new file mode 100644 index 000000000000..388a8e804a88 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-registers-error-all-regs-reserved.ll @@ -0,0 +1,33 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -vgpr-regalloc=greedy -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -vgpr-regalloc=basic -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -vgpr-regalloc=fast -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s + +; FIXME: Should pass verifier after failure. + +declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32 immarg, i32 immarg, i32 immarg) + +; CHECK: error: <unknown>:0:0: no registers from class available to allocate in function 'no_registers_from_class_available_to_allocate' +define <32 x i32> @no_registers_from_class_available_to_allocate(<32 x i32> %arg) #0 { + %ret = call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %arg, i32 1, i32 2, i32 3) + ret <32 x i32> %ret +} + +; CHECK: error: <unknown>:0:0: no registers from class available to allocate in function 'no_registers_from_class_available_to_allocate_asm_use' +define void @no_registers_from_class_available_to_allocate_asm_use(<32 x i32> %arg) #0 { + call void asm sideeffect "; use $0", "v"(<32 x i32> %arg) + ret void +} + +; CHECK: error: <unknown>:0:0: no registers from class available to allocate in function 'no_registers_from_class_available_to_allocate_asm_def' +define <32 x i32> @no_registers_from_class_available_to_allocate_asm_def() #0 { + %ret = call <32 x i32> asm sideeffect "; def $0", "=v"() + ret <32 x i32> %ret +} + +; CHECK: error: <unknown>:0:0: no registers from class available to allocate in function 'no_registers_from_class_available_to_allocate_undef_asm' +define void @no_registers_from_class_available_to_allocate_undef_asm() #0 { + call void asm sideeffect "; use $0", "v"(<32 x i32> poison) + ret void +} + +attributes #0 = { "amdgpu-waves-per-eu"="10,10" } diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-registers-errors.ll b/llvm/test/CodeGen/AMDGPU/ran-out-of-registers-errors.ll new file mode 100644 index 000000000000..bd1752d21507 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-registers-errors.ll @@ -0,0 +1,63 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -stress-regalloc=1 -vgpr-regalloc=greedy -filetype=null %s 2>&1 | FileCheck -check-prefixes=CHECK,GREEDY -implicit-check-not=error %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -stress-regalloc=1 -vgpr-regalloc=basic -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error -check-prefixes=CHECK,BASIC %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -stress-regalloc=1 -vgpr-regalloc=fast -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error -check-prefixes=CHECK,FAST %s +; RUN: opt -passes=debugify -o %t.bc %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -stress-regalloc=1 -vgpr-regalloc=greedy -filetype=null %t.bc 2>&1 | FileCheck -implicit-check-not=error -check-prefixes=DBGINFO-CHECK,DBGINFO-GREEDY %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -stress-regalloc=1 -vgpr-regalloc=basic -filetype=null %t.bc 2>&1 | FileCheck -implicit-check-not=error -check-prefixes=DBGINFO-CHECK,DBGINFO-BASIC %s + +; FIXME: Asserts when using -O2 + -vgpr-regalloc=fast +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -stress-regalloc=1 -O0 -filetype=null %t.bc 2>&1 | FileCheck -implicit-check-not=error -check-prefixes=DBGINFO-CHECK,DBGINFO-FAST %s + +; TODO: Should we fix emitting multiple errors sometimes in basic and fast? + + +; CHECK: error: <unknown>:0:0: ran out of registers during register allocation in function 'ran_out_of_registers_general' + +; DBGINFO-GREEDY: error: {{.*}}:3:1: ran out of registers during register allocation in function 'ran_out_of_registers_general' + +; DBGINFO-BASIC: error: {{.*}}:1:1: ran out of registers during register allocation in function 'ran_out_of_registers_general' + +; DBGINFO-FAST: error: {{.*}}:3:1: ran out of registers during register allocation in function 'ran_out_of_registers_general' +define i32 @ran_out_of_registers_general(ptr addrspace(1) %ptr) #0 { + %ld0 = load volatile i32, ptr addrspace(1) %ptr + %ld1 = load volatile i32, ptr addrspace(1) %ptr + %add = add i32 %ld0, %ld1 + ret i32 %add +} + +; CHECK: error: inline assembly requires more registers than available at line 23 +; DBGINFO-CHECK: error: inline assembly requires more registers than available at line 23 +define void @ran_out_of_registers_asm_def() #0 { + %asm = call { i32, i32 } asm sideeffect "; def $0 $1", "=v,=v"(), !srcloc !0 + ret void +} + +; CHECK: error: inline assembly requires more registers than available at line 23 +; DBGINFO-CHECK: error: inline assembly requires more registers than available at line 23 +define void @ran_out_of_registers_asm_use() #0 { + call void asm sideeffect "; def $0 $1", "v,v"(i32 0, i32 1), !srcloc !0 + ret void +} + +; Test error in anonymous function. + +; GREEDY: error: inline assembly requires more registers than available at line 23 +; BASIC: error: inline assembly requires more registers than available at line 23 + +; FAST: error: <unknown>:0:0: ran out of registers during register allocation in function '@0' + +; DBGINFO-GREEDY: error: inline assembly requires more registers than available at line 23 +; DBGINFO-BASIC: error: inline assembly requires more registers than available at line 23 + +; DBGINFO-FAST: error: {{.*}}:12:1: ran out of registers during register allocation in function '@0' +define i32 @0(ptr addrspace(1) %ptr) #0 { + %asm = call { i32, i32 } asm sideeffect "; def $0 $1 use $2", "=v,=v,v"(ptr addrspace(1) %ptr), !srcloc !0 + %elt0 = extractvalue { i32, i32 } %asm, 0 + %elt1 = extractvalue { i32, i32 } %asm, 1 + %add = add i32 %elt0, %elt1 + ret i32 %add +} + +attributes #0 = { "target-cpu"="gfx908" } + +!0 = !{i32 23} diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll index c1d647c5d3b9..72ae3966416f 100644 --- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll @@ -7,7 +7,7 @@ @global.2 = internal addrspace(1) global %struct.foo { %struct.pluto zeroinitializer, ptr addrspacecast (ptr addrspace(1) @global.2 to ptr), i64 0 } ;. -; CHECK: @[[GLOBAL_2:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(1) global [[STRUCT_FOO:%.*]] { [[STRUCT_PLUTO:%.*]] zeroinitializer, ptr addrspacecast (ptr addrspace(1) @global.2 to ptr), i64 0 } +; CHECK: @global.2 = internal addrspace(1) global %struct.foo { %struct.pluto zeroinitializer, ptr addrspacecast (ptr addrspace(1) @global.2 to ptr), i64 0 } ;. define void @hoge() { ; CHECK-LABEL: define void @hoge @@ -19,5 +19,5 @@ define void @hoge() { ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll index 1b6e88524e96..45ca0d4e156b 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll +++ b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll @@ -1,11 +1,10 @@ -; RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -o - %s 2>%t.err | FileCheck %s +; RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -o - %s 2>%t.err | FileCheck -implicit-check-not=error %s ; RUN: FileCheck -check-prefix=ERR %s < %t.err ; This testcase would fail on an "illegal eviction". If the assert was ; relaxed to allow equivalent cascade numbers, it would infinite loop. ; ERR: error: inline assembly requires more registers than available -; ERR: error: inline assembly requires more registers than available %asm.output = type { <16 x i32>, <8 x i32>, <5 x i32>, <4 x i32>, <16 x i32> } diff --git a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll index 04e995b6f343..8e3054cceb85 100644 --- a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll @@ -10,7 +10,7 @@ ; This happens due to when register allocator is out of registers ; it takes the first avialable register. -; CHECK: error: ran out of registers during register allocation +; CHECK: error: <unknown>:0:0: ran out of registers during register allocation ; CHECK: Bad machine code: Using an undefined physical register define amdgpu_kernel void @alloc_failure_with_split_vregs(float %v0, float %v1) #0 { %agpr0 = call float asm sideeffect "; def $0", "=${a0}"() diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index 4420833029d4..2850612d7008 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -178,7 +178,7 @@ define internal void @mutual_recursion_1(i16 %arg) { define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion( -; CHECK-SAME: ) #[[ATTR2]] !llvm.amdgcn.lds.kernel.id [[META9:![0-9]+]] { +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META9:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ] ; CHECK-NEXT: call void @mutual_recursion_0(i16 0) ; CHECK-NEXT: ret void @@ -191,13 +191,14 @@ define amdgpu_kernel void @kernel_lds_recursion() { !1 = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CHECK: [[META0]] = !{i32 0, i32 1} ; CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/remove-not-short-exec-branch-on-unconditional-jump.mir b/llvm/test/CodeGen/AMDGPU/remove-not-short-exec-branch-on-unconditional-jump.mir new file mode 100644 index 000000000000..f45f48434fa2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remove-not-short-exec-branch-on-unconditional-jump.mir @@ -0,0 +1,90 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass=si-pre-emit-peephole %s -o - | FileCheck %s +# Do no remove S_CBRANCH_EXECZ if the following block contains an unconditional +# branch to a block other than the one immediately following it. + +--- +name: test +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; CHECK-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr1, $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr2_sgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: liveins: $vgpr1, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr1, $sgpr0_sgpr1, $sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: liveins: $vgpr1, $sgpr0_sgpr1, $sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: liveins: $vgpr1, $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc + ; CHECK-NEXT: renamable $vgpr0 = V_CVT_F32_U32_e32 killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 + bb.0: + liveins: $vgpr0, $vgpr1 + + $sgpr0_sgpr1 = S_MOV_B64 $exec + V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + S_CBRANCH_EXECZ %bb.5, implicit $exec + + bb.1: + liveins: $vgpr1, $sgpr0_sgpr1 + + renamable $sgpr2_sgpr3 = IMPLICIT_DEF + renamable $sgpr4_sgpr5 = IMPLICIT_DEF + S_BRANCH %bb.2 + + bb.2: + liveins: $vgpr1, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 + + $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc + S_CBRANCH_EXECZ %bb.4, implicit $exec + + bb.3: + liveins: $vgpr1, $sgpr0_sgpr1, $sgpr2_sgpr3 + + renamable $sgpr4_sgpr5 = IMPLICIT_DEF + S_BRANCH %bb.1 + + bb.4: + liveins: $vgpr1, $sgpr0_sgpr1, $sgpr2_sgpr3 + + $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc + + bb.5: + liveins: $vgpr1, $sgpr0_sgpr1 + + $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc + renamable $vgpr0 = V_CVT_F32_U32_e32 killed $vgpr1, implicit $mode, implicit $exec + SI_RETURN_TO_EPILOG killed $vgpr0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll index 8c584a1890c9..98b701ab7f9d 100644 --- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll @@ -339,7 +339,7 @@ define amdgpu_kernel void @all_local_size(ptr addrspace(1) nocapture readnone %o ; TODO: Should be able to handle this, but not much reason to. ; CHECK-LABEL: @partial_load_group_size_x( ; CHECK-NEXT: %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 +; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds nuw i8, ptr addrspace(4) %dispatch.ptr, i64 4 ; CHECK-NEXT: %group.size.x.lo = load i8, ptr addrspace(4) %gep.group.size.x, align 4 ; CHECK-NEXT: store i8 %group.size.x.lo, ptr addrspace(1) %out, align 1 define amdgpu_kernel void @partial_load_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { @@ -352,7 +352,7 @@ define amdgpu_kernel void @partial_load_group_size_x(ptr addrspace(1) %out) #0 ! ; CHECK-LABEL: @partial_load_group_size_x_explicit_callsite_align( ; CHECK-NEXT: %dispatch.ptr = tail call align 2 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 +; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds nuw i8, ptr addrspace(4) %dispatch.ptr, i64 4 ; CHECK-NEXT: %group.size.x.lo = load i8, ptr addrspace(4) %gep.group.size.x, align 2 ; CHECK-NEXT: store i8 %group.size.x.lo, ptr addrspace(1) %out, align 1 define amdgpu_kernel void @partial_load_group_size_x_explicit_callsite_align(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { @@ -394,7 +394,7 @@ define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(ptr addrspa ; CHECK-LABEL: @use_local_size_x_uniform_work_group_size( ; CHECK-NEXT: %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 +; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds nuw i8, ptr addrspace(4) %dispatch.ptr, i64 4 ; CHECK-NEXT: %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4 ; CHECK: %group.size.x.zext = zext i16 %group.size.x to i32 ; CHECK: store i64 %zext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll index f3c9a5c471ac..f4b947ade8da 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -2006,14 +2006,12 @@ define float @v_rsq_f32(float %val) { ; GCN-IEEE-SAFE: ; %bb.0: ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4b800000 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x45800000 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 %div = fdiv contract float 1.0, %sqrt, !fpmath !1 @@ -2296,10 +2294,9 @@ define float @v_rsq_f32_contractable_user(float %val0, float %val1) { ; GCN-IEEE-SAFE: ; %bb.0: ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc @@ -2331,10 +2328,9 @@ define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float % ; GCN-IEEE-SAFE: ; %bb.0: ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc @@ -2366,14 +2362,12 @@ define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float % ; GCN-IEEE-SAFE: ; %bb.0: ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 @@ -2416,14 +2410,12 @@ define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) { ; GCN-IEEE-SAFE: ; %bb.0: ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4b800000 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x45800000 -; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc +; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 %div = fdiv contract float 1.0, %sqrt, !fpmath !1 diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll index 5e6849ec61b4..4865290fd51d 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck %s define amdgpu_vs float @sitofp_i32_to_f32(i32 inreg %val) { ; CHECK-LABEL: sitofp_i32_to_f32: diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll index cf73803f8929..81d792183dc0 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX12 %s define amdgpu_vs float @fadd_f32(float inreg %a, float inreg %b) { ; CHECK-LABEL: fadd_f32: @@ -36,23 +38,37 @@ define amdgpu_vs float @fmul_f32(float inreg %a, float inreg %b) { } define amdgpu_vs float @fmin_f32(float inreg %a, float inreg %b) { -; CHECK-LABEL: fmin_f32: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_min_f32 s0, s0, s1 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: ; return to shader part epilog +; GFX1150-LABEL: fmin_f32: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_min_f32 s0, s0, s1 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GFX1150-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmin_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_min_num_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog %min = call float @llvm.minnum.f32(float %a, float %b) ret float %min } define amdgpu_vs float @fmax_f32(float inreg %a, float inreg %b) { -; CHECK-LABEL: fmax_f32: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_max_f32 s0, s0, s1 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: ; return to shader part epilog +; GFX1150-LABEL: fmax_f32: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_max_f32 s0, s0, s1 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GFX1150-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmax_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_max_num_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog %max = call float @llvm.maxnum.f32(float %a, float %b) ret float %max } @@ -91,23 +107,37 @@ define amdgpu_vs half @fmul_f16(half inreg %a, half inreg %b) { } define amdgpu_vs half @fmin_f16(half inreg %a, half inreg %b) { -; CHECK-LABEL: fmin_f16: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_min_f16 s0, s0, s1 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: ; return to shader part epilog +; GFX1150-LABEL: fmin_f16: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_min_f16 s0, s0, s1 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GFX1150-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmin_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_min_num_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog %min = call half @llvm.minnum.f16(half %a, half %b) ret half %min } define amdgpu_vs half @fmax_f16(half inreg %a, half inreg %b) { -; CHECK-LABEL: fmax_f16: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_max_f16 s0, s0, s1 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: ; return to shader part epilog +; GFX1150-LABEL: fmax_f16: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_max_f16 s0, s0, s1 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GFX1150-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmax_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_max_num_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog %max = call half @llvm.maxnum.f16(half %a, half %b) ret half %max } @@ -179,19 +209,33 @@ define amdgpu_vs half @fmac_f16_with_mov(half inreg %a, half inreg %b, half inre ; Regression test for crash in SIFoldOperands define amdgpu_ps float @_amdgpu_ps_main() { -; CHECK-LABEL: _amdgpu_ps_main: -; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s3, s0 -; CHECK-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: ; return to shader part epilog +; GFX1150-LABEL: _amdgpu_ps_main: +; GFX1150: ; %bb.0: ; %bb +; GFX1150-NEXT: s_mov_b32 s0, 0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_mov_b32 s1, s0 +; GFX1150-NEXT: s_mov_b32 s2, s0 +; GFX1150-NEXT: s_mov_b32 s3, s0 +; GFX1150-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GFX1150-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: _amdgpu_ps_main: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog bb: %i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0) %i1 = bitcast i32 %i to float diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll index ca508eb40017..3d283d6b1850 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s define amdgpu_vs void @f32_olt(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) { ; SDAG-LABEL: f32_olt: diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir b/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir index 4b6e204ecf95..c2cd4653bc9b 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-enable-max-ilp-scheduling-strategy -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-sched-strategy=max-ilp -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s --- name: max-ilp-liveness-tracking diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll index 11602b1d353f..350ff94373a7 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-max-ilp -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-enable-max-ilp-scheduling-strategy -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-sched-strategy=max-ilp -verify-machineinstrs < %s | FileCheck %s ; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}} diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir index 774785fb3966..d352e8a13da9 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir @@ -54,3 +54,45 @@ body: | SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... + +--- +name: sgpr_spill_s32_undef +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + hasSpilledSGPRs: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +body: | + bb.0: + ; CHECK-LABEL: name: sgpr_spill_s32_undef + ; CHECK: body: + ; CHECK-NEXT: bb.0: + ; CHECK-NOT: {{.+}} + ; CHECK: ... + SI_SPILL_S32_SAVE undef $sgpr8, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s32) into %stack.0, align 4, addrspace 5) + +... + +--- +name: sgpr_spill_s64_undef +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + hasSpilledSGPRs: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +stack: + - { id: 0, type: spill-slot, size: 8, alignment: 4, stack-id: sgpr-spill } +body: | + bb.0: + ; CHECK-LABEL: name: sgpr_spill_s64_undef + ; CHECK: body: + ; CHECK-NEXT: bb.0: + ; CHECK-NOT: {{.+}} + ; CHECK: ... + SI_SPILL_S64_SAVE undef $sgpr8_sgpr9, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) + +... diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber-unhandled.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber-unhandled.mir index 1957b524fa4f..d7155f8b40f5 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber-unhandled.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-to-vmem-scc-clobber-unhandled.mir @@ -5,8 +5,8 @@ # it. The save exec path clobbers SCC, so we currently don't have a # path which satisfies both these constraints. -# CHECK: error: unhandled SGPR spill to memory -# CHECK: error: unhandled SGPR spill to memory +# CHECK: error: <unknown>:0:0: in function sgpr32_save_clobber_scc_no_sgprs void (): unhandled SGPR spill to memory +# CHECK: error: <unknown>:0:0: in function sgpr32_save_clobber_scc_no_sgprs void (): unhandled SGPR spill to memory # CHECK: *** Bad machine code: Using an undefined physical register *** # CHECK: - instruction: S_CBRANCH_SCC1 %bb.2, implicit $scc # CHECK-NEXT: - operand 1: implicit $scc diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll index 3902c6dd422f..b2006d1a1f30 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll @@ -100,14 +100,14 @@ entry: } ;. -; NO: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; NO: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; NO: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; NO: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. -; OW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; OW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. -; CW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. ; NO: [[META0]] = !{ptr @bar1, ptr @bar2} ;. diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index 894ab4fde7da..59d7fe107ee5 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -81,7 +81,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { ;. ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. -; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. ; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll index c8e3f97f6151..4993df7e1ba4 100644 --- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -826,5 +826,5 @@ entry: ; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]] ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind } -; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) "uniform-work-group-size"="false" } attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index cda4c085cd25..245df6684384 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -240,9 +240,11 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -344,8 +346,9 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir index c825674de765..b02b6e79d7a7 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir @@ -71,3 +71,37 @@ body: | ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store (s32) into %stack.0 + 4, addrspace 5) SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) ... + +--- +name: spill_a32_undef +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + ; CHECK-LABEL: name: spill_a32_undef + ; CHECK: S_ENDPGM 0 + SI_SPILL_A32_SAVE undef $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + S_ENDPGM 0 +... + +--- +name: spill_a64_undef +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 8, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + ; CHECK-LABEL: name: spill_a64_undef + ; CHECK: S_ENDPGM 0 + SI_SPILL_A64_SAVE undef $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll b/llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll new file mode 100644 index 000000000000..c5763c68e1da --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll @@ -0,0 +1,26 @@ +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck %s --check-prefixes=GCN,PREGFX12-SDAG +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck %s --check-prefixes=GCN,PREGFX12-GISEL +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck %s --check-prefixes=GCN,GFX12PLUS-SDAG +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck %s --check-prefixes=GCN,GFX12PLUS-GISEL + +; GCN-LABEL: name: buffer_swizzle_bit_pregfx12 +; PREGFX12-SDAG: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN {{%[0-9]+}}, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, 0, 1, implicit $exec +; PREGFX12-GISEL: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN {{%[0-9]+}}, {{%[0-9]+}}, {{%[0-9]+}}, 0, 0, 1, implicit $exec +; GFX12PLUS-SDAG: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN killed {{%[0-9]+}}, killed {{%[0-9]+}}, $sgpr_null, 0, 8, 0, implicit $exec +; GFX12PLUS-GISEL: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN {{%[0-9]+}}, {{%[0-9]+}}, $sgpr_null, 0, 8, 0, implicit $exec +define amdgpu_ps <4 x float> @buffer_swizzle_bit_pregfx12(<4 x i32> inreg %0) { + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 8) + ret <4 x float> %data +} + +; GCN-LABEL: name: buffer_swizzle_bit_gfx12plus +; PREGFX12-SDAG: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN {{%[0-9]+}}, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, 0, 0, implicit $exec +; PREGFX12-GISEL: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN {{%[0-9]+}}, {{%[0-9]+}}, {{%[0-9]+}}, 0, 0, 0, implicit $exec +; GFX12PLUS-SDAG: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN killed {{%[0-9]+}}, killed {{%[0-9]+}}, $sgpr_null, 0, 0, 1, implicit $exec +; GFX12PLUS-GISEL: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN {{%[0-9]+}}, {{%[0-9]+}}, $sgpr_null, 0, 0, 1, implicit $exec +define amdgpu_ps <4 x float> @buffer_swizzle_bit_gfx12plus(<4 x i32> inreg %0) { + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 64) + ret <4 x float> %data +} + +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll index b714fda6f1d0..2775de29368f 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_uaddsat_i8: @@ -36,6 +38,28 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-NEXT: v_min_u16 v0, 0xff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_uaddsat_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_u16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_uaddsat_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_min_u16 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -67,6 +91,20 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_uaddsat_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_uaddsat_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -97,6 +135,12 @@ define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result } @@ -136,6 +180,12 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result } @@ -184,6 +234,13 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 clamp ; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_add_u16 v1, v1, v3 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) ret <3 x i16> %result } @@ -238,6 +295,13 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 clamp ; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_add_u16 v1, v1, v3 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -275,6 +339,13 @@ define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result } @@ -317,6 +388,14 @@ define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v3 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result } @@ -365,6 +444,15 @@ define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v4 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v5 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result } @@ -437,6 +525,19 @@ define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { ; GFX10-NEXT: v_add_nc_u32_e64 v6, v6, v14 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v7, v7, v15 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v8 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v9 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v10 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v11 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v4, v4, v12 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v5, v5, v13 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v6, v6, v14 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v7, v7, v15 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs) ret <8 x i32> %result } @@ -565,6 +666,29 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v4, v4, v20 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v5, v5, v21 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v6, v6, v22 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v7, v7, v23 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v8, v8, v24 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v9, v9, v25 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v10, v10, v26 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v11, v11, v27 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v12, v12, v28 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp +; GFX11-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result } @@ -610,6 +734,17 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uaddsat_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index a77e3c226ad2..db7d816386a7 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -716,199 +716,66 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_udiv24_i48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s0, s0, 0xff000000 -; GCN-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NEXT: s_and_b32 s6, s6, 0xff000000 -; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 24 -; GCN-NEXT: v_mac_f32_e32 v1, 0, v2 -; GCN-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-NEXT: s_sub_u32 s8, 0, s0 -; GCN-NEXT: s_subb_u32 s9, 0, s1 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GCN-NEXT: s_and_b32 s2, s2, 0xff000000 +; GCN-NEXT: s_and_b32 s4, s4, 0xff000000 +; GCN-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 24 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 +; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 -; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 -; GCN-NEXT: v_mul_lo_u32 v6, s8, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v6 -; GCN-NEXT: v_mul_hi_u32 v7, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 -; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, s8, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_alignbit_b32 v3, s7, v3, 24 -; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 -; GCN-NEXT: v_mul_hi_u32 v1, v3, v1 -; GCN-NEXT: v_mul_hi_u32 v2, v3, v2 -; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 2, v1 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v10 -; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_sub_i32_e32 v7, vcc, v3, v0 -; GCN-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v6, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v0 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-NEXT: buffer_store_short v3, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv24_i48: -; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-IR-NEXT: s_and_b32 s0, s0, 0xff000000 -; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-IR-NEXT: s_and_b32 s2, s2, 0xff000000 -; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[0:1], 24 -; GCN-IR-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 -; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[0:1], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[0:1] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[2:3], s[6:7] -; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[8:9] -; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 -; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[6:7], s[14:15] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[14:15], exec -; GCN-IR-NEXT: s_cselect_b32 s7, 0, s9 -; GCN-IR-NEXT: s_cselect_b32 s6, 0, s8 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s15, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[14:15], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s12 -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s14 -; GCN-IR-NEXT: s_add_u32 s14, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s15, s1, -1 -; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] -; GCN-IR-NEXT: s_add_u32 s8, s2, s16 -; GCN-IR-NEXT: s_addc_u32 s9, s3, 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s3, 0 -; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while -; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s2, s7, 31 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s2, s14, s12 -; GCN-IR-NEXT: s_subb_u32 s2, s15, s13 -; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31 -; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_and_b32 s2, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 -; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 -; GCN-IR-NEXT: s_add_u32 s8, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 -; GCN-IR-NEXT: .LBB7_4: ; %Flow4 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1] -; GCN-IR-NEXT: .LBB7_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s6 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GCN-IR-NEXT: s_and_b32 s4, s4, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_alignbit_b32 v0, s5, v0, 24 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 +; GCN-IR-NEXT: v_alignbit_b32 v1, s3, v1, 24 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GCN-IR-NEXT: buffer_store_short v3, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i48 %x, 24 %2 = lshr i48 %y, 24 diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index 37d1116e9ecc..bc1b102d33de 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -240,9 +240,11 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -344,8 +346,9 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll index 049db01badac..ab5306bc2452 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll @@ -7,7 +7,7 @@ @x = global i32 0 ;. -; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = global i32 0 +; CHECK: @x = global i32 0 ;. define void @foo() #0 { ; CHECK-LABEL: define {{[^@]+}}@foo @@ -21,7 +21,7 @@ define void @foo() #0 { define amdgpu_kernel void @kernel1() #1 { ; CHECK-LABEL: define {{[^@]+}}@kernel1 -; CHECK-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: ret void ; @@ -31,6 +31,5 @@ define amdgpu_kernel void @kernel1() #1 { attributes #0 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll index c9387f196dff..da0234c90363 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll @@ -2,8 +2,8 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor < %s | FileCheck %s ;. -; CHECK: @[[G1:[a-zA-Z0-9_$"\\.-]+]] = global ptr null -; CHECK: @[[G2:[a-zA-Z0-9_$"\\.-]+]] = global i32 0 +; CHECK: @G1 = global ptr null +; CHECK: @G2 = global i32 0 ;. define weak void @weak() { ; CHECK-LABEL: define {{[^@]+}}@weak @@ -87,7 +87,7 @@ define internal void @internal2() { define amdgpu_kernel void @kernel2() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel2 -; CHECK-SAME: () #[[ATTR3]] { +; CHECK-SAME: () #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: call void @internal2() ; CHECK-NEXT: ret void ; @@ -97,8 +97,9 @@ define amdgpu_kernel void @kernel2() #0 { attributes #0 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll index 7183da2c5efc..9fe753fec0f9 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll @@ -7,7 +7,7 @@ @x = global i32 0 ;. -; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = global i32 0 +; CHECK: @x = global i32 0 ;. define void @func1() #0 { ; CHECK-LABEL: define {{[^@]+}}@func1 @@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 { attributes #2 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll index 6ed04cf63d20..f0e0df00feff 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll @@ -7,7 +7,7 @@ @x = global i32 0 ;. -; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = global i32 0 +; CHECK: @x = global i32 0 ;. define void @func() #0 { ; CHECK-LABEL: define {{[^@]+}}@func @@ -31,7 +31,7 @@ define amdgpu_kernel void @kernel1() #1 { define amdgpu_kernel void @kernel2() #2 { ; CHECK-LABEL: define {{[^@]+}}@kernel2 -; CHECK-SAME: () #[[ATTR2:[0-9]+]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: call void @func() ; CHECK-NEXT: ret void ; @@ -41,7 +41,6 @@ define amdgpu_kernel void @kernel2() #2 { attributes #1 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll index d5ba2fd617c6..de83f91fce26 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll @@ -5,7 +5,7 @@ ; Propagate the uniform-work-group-attribute from the kernel to callee if it doesn't have it ;. -; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = global i32 0 +; CHECK: @x = global i32 0 ;. define void @func() #0 { ; CHECK-LABEL: define {{[^@]+}}@func @@ -52,8 +52,8 @@ attributes #0 = { nounwind } attributes #1 = { "uniform-work-group-size"="false" } attributes #2 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { nounwind "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll index 7f0dfeaf75c8..dc19f4d879e8 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll @@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 { attributes #0 = { nounwind readnone } attributes #1 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll index 8616c73ad51c..51f060a342fa 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll @@ -3,7 +3,7 @@ @x = global i32 0 ;. -; CHECK: @[[X:[a-zA-Z0-9_$"\\.-]+]] = global i32 0 +; CHECK: @x = global i32 0 ;. define void @func1() { ; CHECK-LABEL: define {{[^@]+}}@func1 @@ -49,7 +49,7 @@ define void @func3() { define amdgpu_kernel void @kernel3() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel3 -; CHECK-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: call void @func2() ; CHECK-NEXT: call void @func3() ; CHECK-NEXT: ret void @@ -61,6 +61,5 @@ define amdgpu_kernel void @kernel3() #0 { attributes #0 = { "uniform-work-group-size"="false" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform_branch_with_floating_point_cond.ll b/llvm/test/CodeGen/AMDGPU/uniform_branch_with_floating_point_cond.ll new file mode 100644 index 000000000000..28526aafa2e6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uniform_branch_with_floating_point_cond.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-isel < %s | FileCheck %s + +@external_constant1 = external addrspace(4) constant float, align 4 +@external_constant2 = external addrspace(1) constant float, align 4 +@const.ptr = external addrspace(4) constant ptr, align 4 + +define void @test() { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x30000000), %bb.3(0x50000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_constant1, target-flags(amdgpu-gotprel32-hi) @external_constant1, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[S_LOAD_DWORDX2_IMM]], 0, 0 :: (dereferenceable invariant load (s32) from @external_constant1, addrspace 4) + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: nofpexcept S_CMP_LG_F32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def $scc, implicit $mode + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.bb1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET1:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @const.ptr, target-flags(amdgpu-gotprel32-hi) @const.ptr, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET1]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM killed [[S_LOAD_DWORDX2_IMM1]], 0, 0 :: (invariant load (s64) from @const.ptr, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM2]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32) from %ir.0, addrspace 1) + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1092616192 + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; CHECK-NEXT: nofpexcept S_CMP_LT_F32 killed [[COPY]], killed [[S_MOV_B32_2]], implicit-def $scc, implicit $mode + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb2: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.Flow1: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_BRANCH %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.bb3: + ; CHECK-NEXT: successors: %bb.5(0x50000000), %bb.6(0x30000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_1]], %bb.1, [[S_MOV_B32_3]], %bb.2 + ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: nofpexcept S_CMP_NEQ_F32 [[PHI]], killed [[S_MOV_B32_4]], implicit-def $scc, implicit $mode + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.6, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.bb4: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET2:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @external_constant2, target-flags(amdgpu-gotprel32-hi) @external_constant2, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM3:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET2]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1082130432, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_1]], killed [[V_MOV_B32_e32_2]], killed [[S_LOAD_DWORDX2_IMM3]], 0, 0, implicit $exec :: (store (s32) into @external_constant2, addrspace 1) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.Flow: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.bb5: + ; CHECK-NEXT: SI_RETURN +entry: + %ld1 = load float, ptr addrspace(4) @external_constant1 + %cmp1 = fcmp one float %ld1, 0.0 + br i1 %cmp1, label %bb5, label %bb1, !amdgpu.uniform !0 + +bb1: + %ptr = load ptr, ptr addrspace(4) @const.ptr + %ld2 = load float, ptr %ptr, align 4 + %cmp2 = fcmp olt float %ld2, 1.0 + %or = or i1 %cmp2, false + br i1 %or, label %bb3, label %bb2, !amdgpu.uniform !0 + +bb2: + br label %bb3 + +bb3: + %phi = phi float [ 10.0, %bb1 ], [ 0.0, %bb2 ] + %cmp3 = fcmp oeq float %phi, 0.0 + br i1 %cmp3, label %bb4, label %bb5, !amdgpu.uniform !0 + +bb4: + store float 4.0, ptr addrspace(1) @external_constant2 + br label %bb5 + +bb5: + ret void +} + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index b4f977db8043..a794d139063d 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -665,54 +665,47 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s13, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_sub_i32 s1, 0, s0 -; GCN-NEXT: s_lshr_b32 s6, s15, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: s_lshr_b32 s6, s13, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GCN-NEXT: s_lshr_b32 s0, s15, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GCN-NEXT: s_lshr_b32 s7, s11, 9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: s_sub_i32 s1, 0, s6 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-NEXT: s_lshr_b32 s1, s9, 1 -; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-NEXT: s_mul_i32 s2, s2, s0 -; GCN-NEXT: s_sub_i32 s1, s1, s2 -; GCN-NEXT: s_sub_i32 s2, s1, s0 -; GCN-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-NEXT: s_cselect_b32 s1, s2, s1 -; GCN-NEXT: s_sub_i32 s2, s1, s0 -; GCN-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-NEXT: s_cselect_b32 s8, s2, s1 -; GCN-NEXT: s_sub_i32 s0, 0, s6 -; GCN-NEXT: v_mul_lo_u32 v0, s0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 +; GCN-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 +; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GCN-NEXT: s_lshr_b32 s8, s9, 1 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v5 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s7, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mul_i32 s4, s4, s6 -; GCN-NEXT: s_sub_i32 s4, s7, s4 +; GCN-NEXT: s_sub_i32 s4, s8, s4 ; GCN-NEXT: s_sub_i32 s5, s4, s6 ; GCN-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-NEXT: s_cselect_b32 s4, s5, s4 ; GCN-NEXT: s_sub_i32 s5, s4, s6 ; GCN-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -720,54 +713,47 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 ; GCN-IR-LABEL: s_test_urem23_64_v2i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s0, s13, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 -; GCN-IR-NEXT: s_lshr_b32 s6, s15, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: s_lshr_b32 s6, s13, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GCN-IR-NEXT: s_lshr_b32 s0, s15, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GCN-IR-NEXT: s_lshr_b32 s7, s11, 9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-IR-NEXT: s_sub_i32 s1, 0, s6 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s9, 1 -; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-IR-NEXT: s_mul_i32 s2, s2, s0 -; GCN-IR-NEXT: s_sub_i32 s1, s1, s2 -; GCN-IR-NEXT: s_sub_i32 s2, s1, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-IR-NEXT: s_cselect_b32 s1, s2, s1 -; GCN-IR-NEXT: s_sub_i32 s2, s1, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s1, s0 -; GCN-IR-NEXT: s_cselect_b32 s8, s2, s1 -; GCN-IR-NEXT: s_sub_i32 s0, 0, s6 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s0, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 +; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v5, s1, v0 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GCN-IR-NEXT: s_lshr_b32 s8, s9, 1 +; GCN-IR-NEXT: v_mul_hi_u32 v5, v0, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s8, v0 ; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s7, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 +; GCN-IR-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-IR-NEXT: s_mul_i32 s4, s4, s6 -; GCN-IR-NEXT: s_sub_i32 s4, s7, s4 +; GCN-IR-NEXT: s_sub_i32 s4, s8, s4 ; GCN-IR-NEXT: s_sub_i32 s5, s4, s6 ; GCN-IR-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 ; GCN-IR-NEXT: s_sub_i32 s5, s4, s6 ; GCN-IR-NEXT: s_cmp_ge_u32 s4, s6 ; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s4 +; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll index 8cc7025d671c..775602ab80cd 100644 --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_usubsat_i8: @@ -27,13 +28,30 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_usubsat_i8: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_usubsat_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_usubsat_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_usubsat_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -60,11 +78,24 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_usubsat_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_usubsat_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_usubsat_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_usubsat_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -93,11 +124,23 @@ define i16 @usubsat_as_bithack_i16(i16 %x) { ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: usubsat_as_bithack_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: usubsat_as_bithack_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: usubsat_as_bithack_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: usubsat_as_bithack_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %signsplat = ashr i16 %x, 15 %flipsign = xor i16 %x, 32768 %result = and i16 %signsplat, %flipsign @@ -128,11 +171,23 @@ define i16 @usubsat_as_bithack2_i16(i16 %x) { ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: usubsat_as_bithack2_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: usubsat_as_bithack2_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: usubsat_as_bithack2_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: usubsat_as_bithack2_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %signsplat = ashr i16 %x, 15 %flipsign = add i16 %x, 32768 %result = and i16 %signsplat, %flipsign @@ -163,11 +218,23 @@ define i16 @usubsat_as_bithack_commute_i16(i16 %x) { ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: usubsat_as_bithack_commute_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: usubsat_as_bithack_commute_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: usubsat_as_bithack_commute_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 0x8000 clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: usubsat_as_bithack_commute_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %signsplat = ashr i16 %x, 15 %flipsign = add i16 %x, 32768 %result = and i16 %flipsign, %signsplat diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll index d4f7bf656d3b..bad0be16e75c 100644 --- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s -; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s -; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s define half @swap(half %a, half %b, i32 %i) { ; GFX11-TRUE16-LABEL: swap: diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll index aec86ec343bd..0af8c95da8d8 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll @@ -368,11 +368,11 @@ define amdgpu_kernel void @vector_bitcast_from_alloca_array(ptr addrspace(1) %ou ; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array( ; OPT-NOT: alloca ; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4 -; OPT-NEXT: %out.repack1 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 4 +; OPT-NEXT: %out.repack1 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 4 ; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4 -; OPT-NEXT: %out.repack2 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 8 +; OPT-NEXT: %out.repack2 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 8 ; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4 -; OPT-NEXT: %out.repack3 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 12 +; OPT-NEXT: %out.repack3 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 12 ; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4 ; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array: @@ -394,11 +394,11 @@ define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array(ptr addrspa ; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array( ; OPT-NOT: alloca ; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4 -; OPT-NEXT: %out.repack1 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 4 +; OPT-NEXT: %out.repack1 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 4 ; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4 -; OPT-NEXT: %out.repack2 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 8 +; OPT-NEXT: %out.repack2 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 8 ; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4 -; OPT-NEXT: %out.repack3 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 12 +; OPT-NEXT: %out.repack3 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i64 12 ; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4 ; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir index c2badc5720f1..edea344a66a3 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir @@ -153,3 +153,37 @@ body: | ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5) ... + +--- +name: spill_v32_undef +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + ; CHECK-LABEL: name: spill_v32_undef + ; CHECK: S_NOP 0, implicit undef $vgpr0 + SI_SPILL_V32_SAVE undef $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + S_NOP 0, implicit undef $vgpr0 +... + +--- +name: spill_v64_undef +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 8, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' +body: | + bb.0: + ; CHECK-LABEL: name: spill_v64_undef + ; CHECK: S_NOP 0, implicit undef $vgpr0_vgpr1 + SI_SPILL_V64_SAVE undef $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) + S_NOP 0, implicit undef $vgpr0_vgpr1 +... diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir index b94e5c450cd1..26b03e7cdf8a 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir @@ -2,11 +2,13 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX12 %s --- | define amdgpu_kernel void @max-counter-lgkmcnt() #0 { ret void } define amdgpu_kernel void @max-counter-vmcnt() #0 { ret void } define amdgpu_kernel void @max-counter-expcnt() #0 { ret void } + define amdgpu_kernel void @max-counter-dscnt() #0 { ret void } attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } ... @@ -112,6 +114,41 @@ body: | ; GFX11-NEXT: S_WAITCNT 64743 ; GFX11-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; GFX12-LABEL: name: max-counter-lgkmcnt + ; GFX12: liveins: $vgpr99 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec + ; GFX12-NEXT: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec + ; GFX12-NEXT: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec + ; GFX12-NEXT: $vgpr6_vgpr7 = DS_READ2_B32_gfx9 renamable $vgpr99, 6, 7, 0, implicit $exec + ; GFX12-NEXT: $vgpr8_vgpr9 = DS_READ2_B32_gfx9 renamable $vgpr99, 8, 9, 0, implicit $exec + ; GFX12-NEXT: $vgpr10_vgpr11 = DS_READ2_B32_gfx9 renamable $vgpr99, 10, 11, 0, implicit $exec + ; GFX12-NEXT: $vgpr12_vgpr13 = DS_READ2_B32_gfx9 renamable $vgpr99, 12, 13, 0, implicit $exec + ; GFX12-NEXT: $vgpr14_vgpr15 = DS_READ2_B32_gfx9 renamable $vgpr99, 14, 15, 0, implicit $exec + ; GFX12-NEXT: $vgpr16_vgpr17 = DS_READ2_B32_gfx9 renamable $vgpr99, 16, 17, 0, implicit $exec + ; GFX12-NEXT: $vgpr18_vgpr19 = DS_READ2_B32_gfx9 renamable $vgpr99, 18, 19, 0, implicit $exec + ; GFX12-NEXT: $vgpr20_vgpr21 = DS_READ2_B32_gfx9 renamable $vgpr99, 20, 21, 0, implicit $exec + ; GFX12-NEXT: $vgpr22_vgpr23 = DS_READ2_B32_gfx9 renamable $vgpr99, 22, 23, 0, implicit $exec + ; GFX12-NEXT: $vgpr24_vgpr25 = DS_READ2_B32_gfx9 renamable $vgpr99, 24, 25, 0, implicit $exec + ; GFX12-NEXT: $vgpr26_vgpr27 = DS_READ2_B32_gfx9 renamable $vgpr99, 26, 27, 0, implicit $exec + ; GFX12-NEXT: $vgpr28_vgpr29 = DS_READ2_B32_gfx9 renamable $vgpr99, 28, 29, 0, implicit $exec + ; GFX12-NEXT: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec + ; GFX12-NEXT: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec + ; GFX12-NEXT: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec + ; GFX12-NEXT: S_WAIT_DSCNT 17 + ; GFX12-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX12-NEXT: S_WAIT_DSCNT 16 + ; GFX12-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX12-NEXT: S_WAIT_DSCNT 15 + ; GFX12-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec + ; GFX12-NEXT: S_WAIT_DSCNT 14 + ; GFX12-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec @@ -377,6 +414,87 @@ body: | ; GFX11-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec ; GFX11-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; GFX12-LABEL: name: max-counter-vmcnt + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, implicit $exec + ; GFX12-NEXT: S_WAIT_LOADCNT 62 + ; GFX12-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec @@ -502,6 +620,24 @@ body: | ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec ; GFX11-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; GFX12-LABEL: name: max-counter-expcnt + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec @@ -513,3 +649,245 @@ body: | $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec S_ENDPGM 0 ... + +--- +name: max-counter-dscnt +body: | + bb.0: + liveins: $vgpr99 + + ; GFX9-LABEL: name: max-counter-dscnt + ; GFX9: liveins: $vgpr99 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_WAITCNT 0 + ; GFX9-NEXT: $vgpr0 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr1 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr2 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr3 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr4 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr5 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr6 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr7 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr8 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr9 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr19 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr24 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr25 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr30 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr32 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr33 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: S_WAITCNT 52863 + ; GFX9-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; GFX9-NEXT: S_ENDPGM 0 + ; GFX10-LABEL: name: max-counter-dscnt + ; GFX10: liveins: $vgpr99 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: S_WAITCNT 0 + ; GFX10-NEXT: $vgpr0 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr1 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr2 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr3 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr4 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr5 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr6 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr7 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr8 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr9 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr19 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr24 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr25 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr30 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr32 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr33 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: S_WAITCNT 59263 + ; GFX10-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: max-counter-dscnt + ; GFX11: liveins: $vgpr99 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: S_WAITCNT 0 + ; GFX11-NEXT: $vgpr0 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr1 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr2 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr3 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr4 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr5 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr6 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr7 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr8 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr9 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr19 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr24 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr25 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr30 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr32 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr33 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: S_WAITCNT 65143 + ; GFX11-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 + ; GFX12-LABEL: name: max-counter-dscnt + ; GFX12: liveins: $vgpr99 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vgpr0 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr1 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr2 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr3 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr4 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr5 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr6 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr7 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr8 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr9 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr19 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr24 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr25 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr30 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr32 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr33 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: S_WAIT_DSCNT 39 + ; GFX12-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr0 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr1 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr2 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr3 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr4 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr5 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr6 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr7 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr8 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr9 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr10 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr11 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr12 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr13 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr14 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr15 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr16 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr17 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr18 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr19 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr20 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr21 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr22 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr23 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr24 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr25 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr26 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr27 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr28 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr29 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr30 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr31 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr32 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr33 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr34 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr35 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr36 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr37 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr38 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr39 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + S_ENDPGM 0 +... |
