diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
51 files changed, 14907 insertions, 6196 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll deleted file mode 100644 index aefcad491073..000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll +++ /dev/null @@ -1,72 +0,0 @@ -; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -verify-machineinstrs=0 -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s - -; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_align4) -; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align4 -; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align4 void (ptr addrspace(1)): unsupported dynamic alloca - -define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1) %ptr) { - %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id - %n = load i32, ptr addrspace(1) %gep - %alloca = alloca i32, i32 %n, align 4, addrspace(5) - store volatile i32 123, ptr addrspace(5) %alloca - ret void -} - -; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_default_align) -; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_default_align -; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_default_align void (ptr addrspace(1)): unsupported dynamic alloca - -define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_default_align(ptr addrspace(1) %ptr) { - %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id - %n = load i32, ptr addrspace(1) %gep - %alloca = alloca i32, i32 %n, addrspace(5) - store 
volatile i32 123, ptr addrspace(5) %alloca - ret void -} -; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: kernel_dynamic_stackalloc_vgpr_align64) -; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align64 -; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align64 void (ptr addrspace(1)): unsupported dynamic alloca - -define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align64(ptr addrspace(1) %ptr) { - %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id - %n = load i32, ptr addrspace(1) %gep - %alloca = alloca i32, i32 %n, align 64, addrspace(5) - store volatile i32 123, ptr addrspace(5) %alloca - ret void -} - -; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4) -; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4 -; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca - -define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) { - %alloca = alloca i32, i32 %n, align 4, addrspace(5) - store volatile i32 456, ptr addrspace(5) %alloca - ret void -} - -; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_default_align) -; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_default_align -; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_default_align void (i32): unsupported dynamic alloca - -define void @func_dynamic_stackalloc_vgpr_default_align(i32 %n) { - %alloca = alloca i32, i32 %n, addrspace(5) - store volatile i32 456, ptr addrspace(5) %alloca - ret void -} -; 
ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: func_dynamic_stackalloc_vgpr_align64) -; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align64 -; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align64 void (i32): unsupported dynamic alloca - -define void @func_dynamic_stackalloc_vgpr_align64(i32 %n) { - %alloca = alloca i32, i32 %n, align 64, addrspace(5) - store volatile i32 456, ptr addrspace(5) %alloca - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() #0 - -attributes #0 = { nounwind readnone speculatable } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index b0f2aac9a42d..7cafa2f608a4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -3990,6 +3990,116 @@ bb: ret void } +define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) { +; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT: v_add_u32_e32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: v_add3_u32 v0, s2, v0, -16 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s5 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX10-NEXT: scratch_store_dword v0, v1, off 
offset:-16 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX940: ; %bb.0: ; %bb +; GFX940-NEXT: v_add_u32_e32 v0, s1, v0 +; GFX940-NEXT: v_add3_u32 v0, s0, v0, -16 +; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:-16 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm +; +; UNALIGNED_GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; UNALIGNED_GFX9: ; %bb.0: ; %bb +; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v0, s3, v0 +; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; UNALIGNED_GFX9-NEXT: v_add3_u32 v0, s2, v0, -16 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 15 +; UNALIGNED_GFX9-NEXT: scratch_store_dword v0, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: s_endpgm +; +; UNALIGNED_GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; UNALIGNED_GFX10: ; %bb.0: ; %bb +; UNALIGNED_GFX10-NEXT: s_add_u32 s0, s0, s5 +; UNALIGNED_GFX10-NEXT: s_addc_u32 s1, s1, 0 +; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; 
UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v1, off offset:-16 +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: s_endpgm +; +; UNALIGNED_GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; UNALIGNED_GFX940: ; %bb.0: ; %bb +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v0, s1, v0 +; UNALIGNED_GFX940-NEXT: v_add3_u32 v0, s0, v0, -16 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 15 +; UNALIGNED_GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: s_endpgm +; +; UNALIGNED_GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; UNALIGNED_GFX11: ; %bb.0: ; %bb +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 +; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b32 v0, v1, off offset:-16 dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: s_endpgm +; +; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; UNALIGNED_GFX12: ; %bb.0: ; %bb +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 +; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: s_endpgm +bb: + %add1 = add nsw i32 %sidx, %vidx + %add2 = add nsw i32 %add1, -16 + %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2 + store volatile i32 15, ptr addrspace(5) %gep, align 4 + ret void +} + define amdgpu_gs void 
@sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) { ; GFX9-LABEL: sgpr_base_negative_offset: ; GFX9: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll index 0577117e9d9e..d81faf91801b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -10,10 +10,10 @@ define float @v_pow_f32(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -25,19 +25,19 @@ define float @v_pow_f32(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: 
v_cndmask_b32_e32 v2, 0, v2, vcc @@ -49,19 +49,19 @@ define float @v_pow_f32(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -73,17 +73,18 @@ define float @v_pow_f32(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, 
v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -91,32 +92,34 @@ define float @v_pow_f32(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 
v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -127,111 +130,114 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[4:5] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 5, v4 ; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v4 ; GFX6-NEXT: v_log_f32_e32 v1, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, 0x42000000 -; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; GFX6-NEXT: v_sub_f32_e32 v0, v0, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v5, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5] ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX6-NEXT: v_sub_f32_e32 v1, v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x42800000 +; GFX6-NEXT: v_mov_b32_e32 v6, 0x42800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc ; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v7, s[4:5] +; GFX6-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX6-NEXT: 
v_cndmask_b32_e64 v2, 0, v6, s[4:5] ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5] -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_not_b32_e32 v4, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v5, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX8-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 5, v4 ; GFX8-NEXT: v_log_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v4 ; GFX8-NEXT: v_log_f32_e32 v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, 0x42000000 -; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5] ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x42800000 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x42800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 ; 
GFX8-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc ; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v7, s[4:5] +; GFX8-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[4:5] ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX8-NEXT: v_exp_f32_e32 v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5] -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX8-NEXT: v_not_b32_e32 v4, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX8-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX9-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 5, v4 ; GFX9-NEXT: v_log_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_ldexp_f32 v1, v1, v4 ; GFX9-NEXT: v_log_f32_e32 v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x42000000 -; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX9-NEXT: v_sub_f32_e32 v0, 
v0, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5] ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x42800000 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x42800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc ; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v7, s[4:5] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[4:5] ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_exp_f32_e32 v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5] -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_not_b32_e32 v4, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f32: @@ -239,10 +245,12 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s4 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v4 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v5 ; GFX10-NEXT: v_cndmask_b32_e64 
v4, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s4 ; GFX10-NEXT: v_log_f32_e32 v0, v0 @@ -257,46 +265,54 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s4 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s4 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: v_exp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s0 -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v1, v1, v5 ; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: v_log_f32_e32 v1, v1 ; GFX11-NEXT: 
s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v2 :: v_dual_mul_dx9_zero_f32 v1, v1, v3 -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_sub_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v4, 5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_ldexp_f32 v0, v0, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s0 -; GFX11-NEXT: v_exp_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v1, v1 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, 
v3 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y) ret <2 x float> %pow @@ -316,9 +332,9 @@ define half @v_pow_f16(half %x, half %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -388,18 +404,18 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; GFX6-NEXT: v_not_b32_e32 v3, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -508,17 +524,17 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> 
%x, <2 x half> %y) { ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; GFX6-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 +; GFX6-NEXT: v_not_b32_e32 v5, 63 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_exp_f32_e32 v2, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v1, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v1, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -634,17 +650,17 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 +; GFX6-NEXT: v_not_b32_e32 v5, 63 ; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v6 +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -764,17 +780,17 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x 
half> %x, <2 x half> %y) { ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 +; GFX6-NEXT: v_not_b32_e32 v5, 63 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GFX6-NEXT: v_exp_f32_e32 v2, v2 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v1, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v2, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -885,10 +901,10 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -900,19 +916,19 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; 
GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fabs_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -924,19 +940,19 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fabs_lhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -948,17 +964,18 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: 
v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fabs_lhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, |v0| -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4 -; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -966,9 +983,9 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fabs_lhs: @@ -976,23 +993,24 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: 
v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %pow = call float @llvm.pow.f32(float %fabs.x, float %y) @@ -1004,10 +1022,10 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1019,19 +1037,19 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, 
v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fabs_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1043,19 +1061,19 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fabs_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1067,17 +1085,18 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: 
v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fabs_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -1085,32 +1104,34 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fabs_rhs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %pow = call float @llvm.pow.f32(float %x, float %fabs.y) @@ -1122,10 +1143,10 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1137,19 +1158,19 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; 
GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fabs_lhs_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1161,19 +1182,19 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fabs_lhs_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1185,17 +1206,18 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, 
v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fabs_lhs_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, |v0| -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4 -; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -1203,9 +1225,9 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fabs_lhs_rhs: @@ -1213,23 +1235,24 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: 
s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fabs.y = call float @llvm.fabs.f32(float %y) @@ -1241,10 +1264,10 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX6-LABEL: v_pow_f32_sgpr_vgpr: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v1, s0, v1 ; GFX6-NEXT: v_log_f32_e32 v1, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1256,18 +1279,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, 
v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_sgpr_vgpr: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX8-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX8-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX8-NEXT: v_log_f32_e32 v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1279,18 +1302,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_sgpr_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX9-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX9-NEXT: v_log_f32_e32 v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1302,49 +1325,51 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: 
v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_sgpr_vgpr: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s1 -; GFX10-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX10-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX10-NEXT: v_log_f32_e32 v1, v1 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_sgpr_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s1 -; GFX11-NEXT: v_mul_f32_e32 v1, s0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX11-NEXT: v_log_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -1354,10 +1379,10 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX6-LABEL: v_pow_f32_vgpr_sgpr: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1369,18 +1394,18 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_vgpr_sgpr: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; 
GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1392,18 +1417,18 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_vgpr_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1415,16 +1440,17 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_vgpr_sgpr: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 
0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1432,31 +1458,33 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_vgpr_sgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, 
vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -1466,10 +1494,10 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX6-LABEL: v_pow_f32_sgpr_sgpr: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX6-NEXT: v_ldexp_f32_e32 v0, s0, v0 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1481,18 +1509,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_sgpr_sgpr: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 
v0, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX8-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1504,18 +1532,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_sgpr_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX9-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1527,49 +1555,51 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_sgpr_sgpr: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 
0, 0x42000000, s2 -; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX10-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_sgpr_sgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s2 -; GFX11-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -1580,10 +1610,10 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e64 v0, -v0, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1595,19 +1625,19 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, -v0, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 
0, v2, vcc @@ -1619,19 +1649,19 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fneg_lhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, -v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1643,17 +1673,18 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fneg_lhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, -v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4 -; GFX10-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, -v0, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 ; GFX10-NEXT: 
v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -1661,9 +1692,9 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fneg_lhs: @@ -1671,23 +1702,24 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX11-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v0, -v0, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; 
GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %pow = call float @llvm.pow.f32(float %neg.x, float %y) @@ -1699,10 +1731,10 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1714,19 +1746,19 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1738,19 
+1770,19 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fneg_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1762,17 +1794,18 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fneg_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, 
v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -1780,32 +1813,34 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fneg_rhs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; 
GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg float %y %pow = call float @llvm.pow.f32(float %x, float %neg.y) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir index 55015c6d13d8..cdb67caea12c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir @@ -20,8 +20,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 @@ -58,8 +58,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, 
implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir index 4241f945a87d..ed811d37c3d0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir @@ -20,8 +20,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 @@ -58,8 +58,8 @@ body: | ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] - ; 
GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec - ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]] ; ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll index eeb7b138fde3..fe002d69faf6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll @@ -18,9 +18,9 @@ define i16 @v_powi_f16(i16 %l, i32 %r) { ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_exp_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_not_b32_e32 v1, 63 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -75,53 +75,80 @@ define i16 @v_powi_f16(i16 %l, i32 %r) { } define float @v_powi_f32(float %l, i32 %r) { -; GFX78-LABEL: v_powi_f32: -; GFX78: ; %bb.0: -; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX78-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX78-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX78-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX78-NEXT: v_log_f32_e32 v0, v0 -; GFX78-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX78-NEXT: v_mov_b32_e32 v2, 0x42000000 -; GFX78-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX78-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX78-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; 
GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_exp_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_powi_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_log_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_exp_f32_e32 v0, v0 +; GFX7-NEXT: v_not_b32_e32 v1, 63 +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_powi_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX8-NEXT: v_log_f32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; 
GFX8-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_exp_f32_e32 v0, v0 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 %r) ret float %res diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir index 5378ce2d1efa..10517a49e697 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir @@ -491,3 +491,132 @@ body: | %1:_(p5) = G_DYN_STACKALLOC %0, 32 S_ENDPGM 0, implicit %1 ... + +--- +name: test_dyn_stackalloc_vgpr_align4 +legalized: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: variable-sized, alignment: 4 } +body: | + bb.0: + liveins: $vgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align4 + ; WAVE64: liveins: $vgpr0 + ; WAVE64-NEXT: {{ $}} + ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0 + ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32) + ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) + ; + ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align4 + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0 + ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = 
COPY $sp_reg + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) + %0:_(s32) = COPY $vgpr0 + %1:_(p5) = G_DYN_STACKALLOC %0, 4 + S_ENDPGM 0, implicit %1 +... + +--- +name: test_dyn_stackalloc_vgpr_align16 +legalized: true +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, type: variable-sized, alignment: 16 } +body: | + bb.0: + liveins: $vgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align16 + ; WAVE64: liveins: $vgpr0 + ; WAVE64-NEXT: {{ $}} + ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0 + ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32) + ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) + ; + ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align16 + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0 + ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5) + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32) + ; WAVE32-NEXT: 
$sp_reg = COPY [[PTR_ADD]](p5) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5) + %0:_(s32) = COPY $vgpr0 + %1:_(p5) = G_DYN_STACKALLOC %0, 16 + S_ENDPGM 0, implicit %1 +... + +--- +name: test_dyn_stackalloc_vgpr_align64 +legalized: true +frameInfo: + maxAlignment: 64 +stack: + - { id: 0, type: variable-sized, alignment: 64 } +body: | + bb.0: + liveins: $vgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align64 + ; WAVE64: liveins: $vgpr0 + ; WAVE64-NEXT: {{ $}} + ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0 + ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32) + ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 + ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32) + ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096 + ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32) + ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5) + ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5) + ; + ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align64 + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0 + ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32) + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047 + ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) 
= G_PTR_ADD [[COPY1]], [[C1]](s32) + ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048 + ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32) + ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5) + ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5) + %0:_(s32) = COPY $vgpr0 + %1:_(p5) = G_DYN_STACKALLOC %0, 64 + S_ENDPGM 0, implicit %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll new file mode 100644 index 000000000000..52259c4c2e6e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +; Reduce a 64-bit add by a constant if we know the low 32-bits are all +; zero. 
+ +; add i64:x, K if computeTrailingZeros(K) >= 32 +; => build_pair (add x.hi, K.hi), x.lo + +define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) { +; GFX9-LABEL: s_add_i64_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, 0x40000 +; GFX9-NEXT: ; return to shader part epilog + %add = add i64 %reg, 1125899906842624 ; (1 << 50) + ret i64 %add +} + +define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) { +; GFX9-LABEL: s_add_i64_const_low_bits_known0_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, 1 +; GFX9-NEXT: ; return to shader part epilog + %add = add i64 %reg, 4294967296 ; (1 << 32) + ret i64 %add +} + +define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) { +; GFX9-LABEL: s_add_i64_const_low_bits_known0_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, 2 +; GFX9-NEXT: ; return to shader part epilog + %add = add i64 %reg, 8589934592 ; (1 << 33) + ret i64 %add +} + +define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) { +; GFX9-LABEL: s_add_i64_const_low_bits_known0_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, 0x80000000 +; GFX9-NEXT: ; return to shader part epilog + %add = add i64 %reg, -9223372036854775808 ; (1 << 63) + ret i64 %add +} + +define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_4(i64 inreg %reg) { +; GFX9-LABEL: s_add_i64_const_low_bits_known0_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, -1 +; GFX9-NEXT: ; return to shader part epilog + %add = add i64 %reg, -4294967296 ; 0xffffffff00000000 + ret i64 %add +} + +define i64 @v_add_i64_const_low_bits_known0_0(i64 %reg) { +; GFX9-LABEL: v_add_i64_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 0x40000, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add i64 %reg, 1125899906842624 ; (1 << 50) + ret i64 %add +} + +define i64 @v_add_i64_const_low_bits_known0_1(i64 %reg) { +; GFX9-LABEL: 
v_add_i64_const_low_bits_known0_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add i64 %reg, 4294967296 ; (1 << 32) + ret i64 %add +} + +define i64 @v_add_i64_const_low_bits_known0_2(i64 %reg) { +; GFX9-LABEL: v_add_i64_const_low_bits_known0_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 2, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add i64 %reg, 8589934592 ; (1 << 33) + ret i64 %add +} + +define i64 @v_add_i64_const_low_bits_known0_3(i64 %reg) { +; GFX9-LABEL: v_add_i64_const_low_bits_known0_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add i64 %reg, -9223372036854775808 ; (1 << 63) + ret i64 %add +} + +define i64 @v_add_i64_const_low_bits_known0_4(i64 %reg) { +; GFX9-LABEL: v_add_i64_const_low_bits_known0_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add i64 %reg, -4294967296 ; 0xffffffff00000000 + ret i64 %add +} + +define amdgpu_ps i64 @s_add_i64_const_high_bits_known0_0(i64 inreg %reg) { +; GFX9-LABEL: s_add_i64_const_high_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 s0, s0, -1 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: ; return to shader part epilog + %add = add i64 %reg, 4294967295 ; (1 << 32) - 1 + ret i64 %add +} + +define i64 @v_add_i64_const_high_bits_known0_0(i64 %reg) { +; GFX9-LABEL: v_add_i64_const_high_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add i64 %reg, 4294967295 ; (1 << 32) - 1 + ret i64 %add +} + +define <2 x i64> 
@v_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) { +; GFX9-LABEL: v_add_v2i64_splat_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32) + ret <2 x i64> %add +} + +define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) { +; GFX9-LABEL: v_add_v2i64_nonsplat_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 2, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33) + ret <2 x i64> %add +} + +define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) { +; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s3, s3, 1 +; GFX9-NEXT: ; return to shader part epilog + %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32) + ret <2 x i64> %add +} + +define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) { +; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s3, s3, 2 +; GFX9-NEXT: ; return to shader part epilog + %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33) + ret <2 x i64> %add +} + +; We could reduce this to use a 32-bit add if we use computeKnownBits +define i64 @v_add_i64_variable_high_bits_known0_0(i64 %reg, i32 %offset.hi32) { +; GFX9-LABEL: v_add_i64_variable_high_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, 
vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %zext.offset.hi32 = zext i32 %offset.hi32 to i64 + %in.high.bits = shl i64 %zext.offset.hi32, 32 + %add = add i64 %reg, %in.high.bits + ret i64 %add +} + +; We could reduce this to use a 32-bit add if we use computeKnownBits +define amdgpu_ps i64 @s_add_i64_variable_high_bits_known0_0(i64 inreg %reg, i32 inreg %offset.hi32) { +; GFX9-LABEL: s_add_i64_variable_high_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 s0, s0, 0 +; GFX9-NEXT: s_addc_u32 s1, s1, s2 +; GFX9-NEXT: ; return to shader part epilog + %zext.offset.hi32 = zext i32 %offset.hi32 to i64 + %in.high.bits = shl i64 %zext.offset.hi32, 32 + %add = add i64 %reg, %in.high.bits + ret i64 %add +} diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index 5b72795ba07e..b128be2186df 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s -;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s -;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX1030 %s -;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < 
%s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-LABEL: fmul_select_f32_test1: @@ -21,22 +25,22 @@ define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f32_test1: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f32_test1: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f32_test1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f32_test1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] 
%bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float 2.000000e+00, float 1.000000e+00 %ldexp = fmul float %x, %y @@ -60,22 +64,22 @@ define float @fmul_select_f32_test2(float %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f32_test2: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f32_test2: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f32_test2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f32_test2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float 5.000000e-01, float 1.000000e+00 %ldexp = fmul float %x, %y @@ -83,49 +87,71 @@ define float @fmul_select_f32_test2(float %x, i32 %bool.arg1, i32 %bool.arg2) { } define <2 x float> @fmul_select_v2f32_test3(<2 x float> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) { -; GFX7-LABEL: fmul_select_v2f32_test3: -; GFX7: ; %bb.0: -; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_v2f32_test3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_v2f32_test3: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1030-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_v2f32_test3: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 -; GFX1100-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_v2f32_test3: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; 
GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_v2f32_test3: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_v2f32_test3: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_v2f32_test3: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc +; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: fmul_select_v2f32_test3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; 
GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_v2f32_test3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00> %ldexp = fmul <2 x float> %x, %y @@ -133,49 +159,71 @@ define <2 x float> @fmul_select_v2f32_test3(<2 x float> %x, <2 x i32> %bool.arg1 } define <2 x float> @fmul_select_v2f32_test4(<2 x float> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) { -; GFX7-LABEL: fmul_select_v2f32_test4: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_v2f32_test4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_v2f32_test4: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1030-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo -; 
GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_v2f32_test4: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 -; GFX1100-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_v2f32_test4: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_v2f32_test4: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_v2f32_test4: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc +; 
GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_v2f32_test4: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc +; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: fmul_select_v2f32_test4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_v2f32_test4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x float> <float 5.000000e-01, float 5.000000e-01>, <2 x float> <float 1.000000e+00, float 1.000000e+00> %ldexp = fmul <2 x float> %x, %y @@ -199,22 +247,22 @@ define float @fmul_select_f32_test5(float %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f32_test5: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f32_test5: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f32_test5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f32_test5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float -2.000000e+00, float -1.000000e+00 %ldexp = fmul float %x, %y @@ -222,44 +270,83 @@ define float @fmul_select_f32_test5(float %x, i32 %bool.arg1, i32 %bool.arg2) { } define float @fmul_select_f32_test6(float %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f32_test6: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000 -; GFX7-NEXT: v_mov_b32_e32 v4, 0xc0400000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f32_test6: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: v_mov_b32_e32 v3, 0x41000000 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xc0400000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f32_test6: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0xc0400000 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f32_test6: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0xc0400000 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f32_test6: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc0400000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f32_test6: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0400000 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0x41000000 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f32_test6: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xc0400000 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f32_test6: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0400000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x41000000 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f32_test6: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xc0400000 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f32_test6: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x41000000 +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc0400000, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f32_test6: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xc0400000 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: 
fmul_select_f32_test6: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x41000000 +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc0400000, vcc_lo +; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float -3.000000e+00, float 8.000000e+00 %ldexp = fmul float %x, %y @@ -285,22 +372,22 @@ define float @fmul_select_f32_test7_sel_log2val_pos59_pos92(float %x, i32 %bool. ; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo -; GFX1030-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: 
v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float 0x43A0000000000000, float 0x45B0000000000000 %ldexp = fmul float %x, %y @@ -308,44 +395,83 @@ define float @fmul_select_f32_test7_sel_log2val_pos59_pos92(float %x, i32 %bool. } define float @fmul_select_f32_test8(float %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f32_test8: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, 0xc1000000 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x41800000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f32_test8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xc1000000 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x41800000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f32_test8: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x41800000 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f32_test8: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0x41800000 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: 
s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f32_test8: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0xc1000000 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f32_test8: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0xc1000000 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f32_test8: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc1000000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f32_test8: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xc1000000 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f32_test8: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x41800000 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo +; GFX10-SDAG-NEXT: 
v_mul_f32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f32_test8: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1000000 +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x41800000, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f32_test8: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x41800000 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_f32_test8: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1000000 +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x41800000, vcc_lo +; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float 1.600000e+01, float -8.000000e+00 %ldexp = fmul float %x, %y @@ -369,22 +495,22 @@ define float @fmul_select_f32_test9(float %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f32_test9: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; 
GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f32_test9: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f32_test9: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f32_test9: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float 0.000000e+00, float 2.000000e+00 %ldexp = fmul float %x, %y @@ -410,22 +536,22 @@ define float @fmul_select_f32_test10(float %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f32_test10: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f32_test10: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; 
GFX10-LABEL: fmul_select_f32_test10: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f32_test10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float -0.000000e+00, float 0.000000e+00 %ldexp = fmul float %x, %y @@ -451,22 +577,22 @@ define float @fmul_select_f32_test11_sel_log2val_pos78_pos56(float %x, i32 %bool ; GFX9-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo -; GFX1030-NEXT: v_ldexp_f32 v0, -v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f32 v0, -v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo +; GFX10-NEXT: v_ldexp_f32 v0, -v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
fmul_select_f32_test11_sel_log2val_pos78_pos56: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v0, -v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float 0xC4D0000000000000, float 0xC370000000000000 %ldexp = fmul float %x, %y @@ -474,44 +600,83 @@ define float @fmul_select_f32_test11_sel_log2val_pos78_pos56(float %x, i32 %bool } define float @fmul_select_f32_test12_sel_log2val_neg48_pos68(float %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, 0x44 -; GFX7-NEXT: v_not_b32_e32 v4, 47 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x44 -; GFX9-NEXT: v_not_b32_e32 v4, 47 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_not_b32_e32 v3, 47 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo -; GFX1030-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_not_b32_e32 v3, 47 
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo -; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x44 +; GFX7-SDAG-NEXT: v_not_b32_e32 v4, 47 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_not_b32_e32 v3, 47 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0x44 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x44 +; GFX9-SDAG-NEXT: v_not_b32_e32 v4, 47 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_not_b32_e32 v3, 47 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x44 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: 
fmul_select_f32_test12_sel_log2val_neg48_pos68: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_not_b32_e32 v3, 47 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x44 +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xffffffd0, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_not_b32_e32 v3, 47 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x44 +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xffffffd0, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, float 0x3CF0000000000000, float 0x4430000000000000 %ldexp = fmul float %x, %y @@ -535,22 +700,22 @@ define double @fmul_select_f64_test1(double %x, i32 %bool.arg1, i32 
%bool.arg2) ; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f64_test1: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f64_test1: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f64_test1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f64_test1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 2.000000e+00, double 1.000000e+00 %ldexp = fmul double %x, %y @@ -574,22 +739,22 @@ define double @fmul_select_f64_test2(double %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f64_test2: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX1030-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f64_test2: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f64_test2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f64_test2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 5.000000e-01, double 1.000000e+00 %ldexp = fmul double %x, %y @@ -619,28 +784,28 @@ define <2 x double> @fmul_select_v2f64_test3(<2 x double> %x, <2 x i32> %bool.ar ; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_v2f64_test3: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX1030-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_v2f64_test3: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1100-NEXT: v_cndmask_b32_e64 
v4, 0, 1, vcc_lo -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX1100-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_v2f64_test3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX10-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_v2f64_test3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00> %ldexp = fmul <2 x double> %x, %y @@ -670,28 +835,28 @@ define <2 x double> @fmul_select_v2f64_test4(<2 x double> %x, <2 x i32> %bool.ar ; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_v2f64_test4: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; 
GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo -; GFX1030-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_v2f64_test4: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1100-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 -; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo -; GFX1100-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_v2f64_test4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX10-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_v2f64_test4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo +; GFX11-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 5.000000e-01>, <2 x double> <double 1.000000e+00, double 1.000000e+00> %ldexp = fmul <2 x double> %x, %y @@ -715,22 +880,22 @@ define double @fmul_select_f64_test5(double %x, 
i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f64_test5: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX1030-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f64_test5: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f64_test5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f64_test5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double -5.000000e-01, double -1.000000e+00 %ldexp = fmul double %x, %y @@ -754,22 +919,22 @@ define double @fmul_select_f64_test6(double %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f64_test6: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX1030-NEXT: v_ldexp_f64 v[0:1], 
-v[0:1], v2 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f64_test6: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f64_test6: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f64_test6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double -2.000000e+00, double -1.000000e+00 %ldexp = fmul double %x, %y @@ -777,44 +942,64 @@ define double @fmul_select_f64_test6(double %x, i32 %bool.arg1, i32 %bool.arg2) } define double @fmul_select_f64_test7(double %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f64_test7: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, 0xbff00000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, 2.0, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f64_test7: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0xbff00000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, 2.0, vcc -; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f64_test7: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f64_test7: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: v_mov_b32_e32 v4, 0 -; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f64_test7: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xbff00000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 2.0, vcc +; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f64_test7: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, 0xbff00000 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 2.0, vcc +; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f64_test7: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xbff00000 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 2.0, vcc +; GFX9-SDAG-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f64_test7: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xbff00000 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 2.0, vcc +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: fmul_select_f64_test7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f64_test7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 2.000000e+00, double -1.000000e+00 %ldexp = fmul double %x, %y @@ -838,22 +1023,22 @@ define double @fmul_select_f64_test8(double %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f64_test8: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc_lo -; GFX1030-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f64_test8: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f64_test8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f64_test8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double -4.000000e+00, double -3.200000e+01 %ldexp = fmul double %x, %y @@ -883,28 +1068,28 @@ define <2 x double> @fmul_select_v2f64_test9(<2 x double> %x, <2 x i32> %bool.ar ; GFX9-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_v2f64_test9: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1030-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4 -; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX1030-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v5 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_v2f64_test9: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1100-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4 -; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX1100-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v5 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_v2f64_test9: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX10-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_v2f64_test9: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x double> <double -2.000000e+00, double -2.000000e+00>, <2 x double> <double -1.000000e+00, double -1.000000e+00> %ldexp = fmul <2 x double> %x, %y @@ -912,60 +1097,115 @@ define <2 x double> @fmul_select_v2f64_test9(<2 x double> %x, <2 x i32> %bool.ar } define <2 x double> @fmul_select_v2f64_test10(<2 x double> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) { -; GFX7-LABEL: fmul_select_v2f64_test10: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, 0xbff00000 -; GFX7-NEXT: v_mov_b32_e32 v9, 0x3fe00000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX7-NEXT: 
v_mov_b32_e32 v8, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] -; GFX7-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_v2f64_test10: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, 0xbff00000 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x3fe00000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] -; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_v2f64_test10: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v8, 0x3fe00000 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1030-NEXT: v_cndmask_b32_e32 v9, 0xbff00000, v8, vcc_lo -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0 -; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] -; GFX1030-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_v2f64_test10: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v8, 0x3fe00000 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_dual_cndmask_b32 v9, 0xbff00000, v8 :: v_dual_mov_b32 v8, 0 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] -; GFX1100-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: 
fmul_select_v2f64_test10: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mov_b32_e32 v8, 0xbff00000 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v9, 0x3fe00000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v8, 0 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_v2f64_test10: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, 0x3fe00000 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX7-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_v2f64_test10: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v8, 0xbff00000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v9, 0x3fe00000 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_v2f64_test10: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v9, 0x3fe00000 
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX9-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_v2f64_test10: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v8, 0x3fe00000 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, 0xbff00000, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX10-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_v2f64_test10: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v9, 0xbff00000 +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v9, v9, 0x3fe00000, vcc_lo +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_v2f64_test10: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, 0x3fe00000 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: 
v_dual_cndmask_b32 v9, 0xbff00000, v8 :: v_dual_mov_b32 v8, 0 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_v2f64_test10: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v9, 0xbff00000 :: v_dual_mov_b32 v8, 0 +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v9, v9, 0x3fe00000, vcc_lo +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 2.000000e+00>, <2 x double> <double -1.000000e+00, double 1.000000e+00> %ldexp = fmul <2 x double> %x, %y @@ -973,44 +1213,64 @@ define <2 x double> @fmul_select_v2f64_test10(<2 x double> %x, <2 x i32> %bool.a } define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f64_test11: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, -2.0, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f64_test11: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
v_bfrev_b32_e32 v4, 1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, -2.0, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f64_test11: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f64_test11: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: v_mov_b32_e32 v4, 0 -; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f64_test11: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, -2.0, vcc +; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f64_test11: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -2.0, vcc +; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f64_test11: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v4, 1 +; 
GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, -2.0, vcc +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f64_test11: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -2.0, vcc +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: fmul_select_f64_test11: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f64_test11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double -2.000000e+00, double -0.000000e+00 %ldexp = fmul double %x, %y @@ -1018,45 +1278,84 @@ define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2) } define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f64_test12: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 31, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: 
v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f64_test12: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 31, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f64_test12: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX1030-NEXT: v_lshlrev_b32_e32 v3, 31, v3 -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f64_test12: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 31, v3 -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f64_test12: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v2 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f64_test12: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-GISEL-NEXT: 
v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f64_test12: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v2 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f64_test12: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f64_test12: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v3 +; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f64_test12: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f64_test12: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 31, v3 +; GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_f64_test12: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 0.000000e+00, double -0.000000e+00 %ldexp = fmul double %x, %y @@ -1084,24 +1383,24 @@ define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f64_test13: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo -; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f64_test13: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: v_mov_b32_e32 v4, 0 -; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f64_test13: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: 
v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f64_test13: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 0.000000e+00, double 1.600000e+01 %ldexp = fmul double %x, %y @@ -1109,44 +1408,83 @@ define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2) } define double @fmul_select_f64_test14_sel_log2val_pos92_neg27(double %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_not_b32_e32 v4, 26 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x5c -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX7-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_not_b32_e32 v4, 26 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x5c -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v4, 0x5c -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo -; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX1030-NEXT: s_setpc_b64 
s[30:31] -; -; GFX1100-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v4, 0x5c -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo -; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_not_b32_e32 v4, 26 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v5, 0x5c +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX7-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0x5c +; GFX7-GISEL-NEXT: v_not_b32_e32 v5, 26 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GFX7-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_not_b32_e32 v4, 26 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x5c +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5c +; GFX9-GISEL-NEXT: v_not_b32_e32 
v5, 26 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0x5c +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_not_b32_e32 v4, 26 +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, v4, 0x5c, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x5c +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_not_b32_e32 v4, 26 +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, v4, 0x5c, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX11-GISEL-NEXT: 
s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 0x45B0000000000000, double 0x3E40000000000000 %ldexp = fmul double %x, %y @@ -1154,44 +1492,83 @@ define double @fmul_select_f64_test14_sel_log2val_pos92_neg27(double %x, i32 %bo } define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_not_b32_e32 v4, 32 -; GFX7-NEXT: v_not_b32_e32 v5, 41 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX7-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_not_b32_e32 v4, 32 -; GFX9-NEXT: v_not_b32_e32 v5, 41 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_not_b32_e32 v4, 41 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo -; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_not_b32_e32 v4, 41 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo -; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; 
GFX7-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_not_b32_e32 v4, 32 +; GFX7-SDAG-NEXT: v_not_b32_e32 v5, 41 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX7-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_not_b32_e32 v4, 41 +; GFX7-GISEL-NEXT: v_not_b32_e32 v5, 32 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GFX7-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_not_b32_e32 v4, 32 +; GFX9-SDAG-NEXT: v_not_b32_e32 v5, 41 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_not_b32_e32 v4, 41 +; GFX9-GISEL-NEXT: v_not_b32_e32 v5, 32 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_not_b32_e32 v4, 41 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 
0xffffffdf, v4, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_not_b32_e32 v4, 32 +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, v4, 0xffffffd6, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_not_b32_e32 v4, 41 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo +; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_not_b32_e32 v4, 32 +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, v4, 0xffffffd6, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, double 0x3D50000000000000, double 0x3DE0000000000000 %ldexp = fmul double %x, %y @@ -1200,40 +1577,82 @@ define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bo define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f16_test1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: 
v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f16_test1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f16_test1: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f16_test1: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f16_test1: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f16_test1: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: 
fmul_select_f16_test1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f16_test1: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f16_test1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f16_test1: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f16_test1: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_f16_test1: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half 2.000000e+00, half 1.000000e+00 %ldexp = fmul half %x, %y @@ -1241,47 +1660,89 @@ define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) { } define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f16_test2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f16_test2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX9-NEXT: v_med3_i32 v1, v1, s4, v2 -; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f16_test2: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: s_movk_i32 s4, 0x8000 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX1030-NEXT: v_med3_i32 v1, v1, s4, 0x7fff -; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f16_test2: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_movk_i32 s0, 0x8000 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_med3_i32 v1, v1, s0, 0x7fff -; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f16_test2: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f16_test2: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f16_test2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX9-SDAG-NEXT: v_med3_i32 v1, v1, s4, v2 +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f16_test2: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; 
GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f16_test2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX10-SDAG-NEXT: v_med3_i32 v1, v1, s4, 0x7fff +; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f16_test2: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f16_test2: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_med3_i32 v1, v1, s0, 0x7fff +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_f16_test2: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 
%bool.arg1, %bool.arg2 %y = select i1 %bool, half 5.000000e-01, half 1.000000e+00 %ldexp = fmul half %x, %y @@ -1289,59 +1750,126 @@ define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) { } define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) { -; GFX7-LABEL: fmul_select_v2f16_test3: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_v2f16_test3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3c00 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x4000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_v2f16_test3: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x4000 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo -; GFX1030-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX1030-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_v2f16_test3: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX1100-NEXT: v_mov_b32_e32 v5, 0x4000 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo -; GFX1100-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_v2f16_test3: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc +; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_v2f16_test3: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_v2f16_test3: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3c00 +; GFX9-SDAG-NEXT: 
v_mov_b32_e32 v6, 0x4000 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_v2f16_test3: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v3, v4 +; GFX9-GISEL-NEXT: v_med3_i32 v2, v2, v3, v4 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v1, v0, v1 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_v2f16_test3: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo +; GFX10-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_v2f16_test3: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v2, v4 +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v1, v0, v1 +; GFX10-GISEL-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3 +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_v2f16_test3: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_v2f16_test3: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, 
v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v2 +; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x half> <half 2.000000e+00, half 2.000000e+00>, <2 x half> <half 1.000000e+00, half 1.000000e+00> %ldexp = fmul <2 x half> %x, %y @@ -1349,59 +1877,126 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1, } define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) { -; GFX7-LABEL: fmul_select_v2f16_test4: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_v2f16_test4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3c00 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x3800 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_v2f16_test4: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x3800 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo -; 
GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo -; GFX1030-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX1030-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_v2f16_test4: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v5, 0x3800 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo -; GFX1100-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_v2f16_test4: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc +; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_v2f16_test4: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: 
v_ldexp_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_v2f16_test4: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3c00 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x3800 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_v2f16_test4: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v3, v4 +; GFX9-GISEL-NEXT: v_med3_i32 v2, v2, v3, v4 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v1, v0, v1 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_v2f16_test4: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x3800 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo +; GFX10-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX10-SDAG-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_v2f16_test4: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v1, v0, v1 +; GFX10-GISEL-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3 +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_v2f16_test4: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x3800 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo +; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_v2f16_test4: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v2 +; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x half> <half 5.000000e-01, half 5.000000e-01>, <2 x half> <half 1.000000e+00, half 1.000000e+00> %ldexp = fmul <2 x half> %x, %y @@ -1409,15 +2004,25 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, } define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f16_test5: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f16_test5: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f16_test5: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: 
v_cndmask_b32_e64 v1, 3, 1, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f16_test5: ; GFX9: ; %bb.0: @@ -1427,22 +2032,22 @@ define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f16_test5: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo -; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f16_test5: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f16_test5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo +; GFX10-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f16_test5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half 2.000000e+00, half 8.000000e+00 %ldexp = fmul half %x, %y @@ -1450,46 +2055,88 @@ define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) { } define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; 
GFX7-LABEL: fmul_select_f16_test6: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x40400000 -; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1000000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f16_test6: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4200 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xc800 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f16_test6: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0xc800 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo -; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f16_test6: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0xc800 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo -; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f16_test6: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1000000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; 
GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f16_test6: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0xc800 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0x4200 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f16_test6: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x4200 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xc800 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f16_test6: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc800 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4200 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f16_test6: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xc800 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo +; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f16_test6: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x4200 +; 
GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f16_test6: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xc800 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_f16_test6: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4200 +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo +; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half -8.000000e+00, half 3.000000e+00 %ldexp = fmul half %x, %y @@ -1497,45 +2144,87 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) { } define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f16_test7: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f16_test7: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 
0xc400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4800 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f16_test7: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x4800 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo -; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f16_test7: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v3, 0x4800 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo -; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f16_test7: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f16_test7: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x4800 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0xc400 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-SDAG-LABEL: fmul_select_f16_test7: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc400 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4800 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f16_test7: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x4800 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xc400 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f16_test7: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x4800 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo +; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f16_test7: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xc400 +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f16_test7: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x4800 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo +; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1 +; 
GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_f16_test7: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xc400 +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo +; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half 8.000000e+00, half -4.000000e+00 %ldexp = fmul half %x, %y @@ -1543,16 +2232,28 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) { } define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f16_test8: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f16_test8: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f16_test8: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x8000 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: 
v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f16_test8: ; GFX9: ; %bb.0: @@ -1563,22 +2264,22 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX1030-LABEL: fmul_select_f16_test8: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo -; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f16_test8: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_f16_test8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_f16_test8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half -0.000000e+00, half 0.000000e+00 %ldexp = fmul half %x, %y @@ -1586,40 +2287,87 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) { } define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; 
GFX7-LABEL: fmul_select_f16_test9: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f16_test9: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc -; GFX9-NEXT: v_ldexp_f16_e64 v0, -v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f16_test9: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc_lo -; GFX1030-NEXT: v_ldexp_f16_e64 v0, -v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f16_test9: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_ldexp_f16_e64 v0, -v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f16_test9: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f16_test9: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX7-GISEL-NEXT: 
v_add_i32_e32 v1, vcc, 5, v1 +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f16_test9: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc +; GFX9-SDAG-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f16_test9: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 5, v1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f16_test9: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc_lo +; GFX10-SDAG-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f16_test9: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 5, v1 +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f16_test9: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: 
v_cndmask_b32_e64 v1, 5, 4, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_f16_test9: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 5, v1 +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half -1.600000e+01, half -3.200000e+01 %ldexp = fmul half %x, %y @@ -1627,47 +2375,82 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) { } define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc -; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX9-NEXT: v_med3_i32 v1, v1, s4, v2 -; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: 
fmul_select_f16_test10_sel_log2val_neg11_pos11: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: s_movk_i32 s4, 0x8000 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo -; GFX1030-NEXT: v_med3_i32 v1, v1, s4, 0x7fff -; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_movk_i32 s0, 0x8000 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_med3_i32 v1, v1, s0, 0x7fff -; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: 
v_cndmask_b32_e64 v1, 11, -11, vcc +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX9-SDAG-NEXT: v_med3_i32 v1, v1, s4, v2 +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo +; GFX10-SDAG-NEXT: v_med3_i32 v1, v1, s4, 0x7fff +; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_med3_i32 v1, v1, s0, 0x7fff +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: 
fmul_select_f16_test10_sel_log2val_neg11_pos11: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half 0xH1000, half 0xH6800 %ldexp = fmul half %x, %y @@ -1675,47 +2458,82 @@ define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.a } define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc -; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX9-NEXT: v_med3_i32 v1, v1, s4, v2 -; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: s_movk_i32 s4, 0x8000 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo -; GFX1030-NEXT: v_med3_i32 v1, v1, s4, 0x7fff -; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: 
-; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_movk_i32 s0, 0x8000 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_med3_i32 v1, v1, s0, 0x7fff -; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX9-SDAG-NEXT: v_med3_i32 v1, v1, s4, v2 +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: 
v_cndmask_b32_e64 v1, -14, 7, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo +; GFX10-SDAG-NEXT: v_med3_i32 v1, v1, s4, 0x7fff +; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_med3_i32 v1, v1, s0, 0x7fff +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, half 0xH5800, half 0xH0400 %ldexp = fmul half %x, %y @@ 
-1723,72 +2541,114 @@ define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.ar } define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_bf16_test1: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_bf16_test1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_bf16_test1: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1030-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_bf16_test1: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_bf16_test1: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_bf16_test1: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: 
fmul_select_bf16_test1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x3f80 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_bf16_test1: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_bf16_test1: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x4000 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; 
GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_bf16_test1: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_bf16_test1: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_bf16_test1: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; 
GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 2.000000e+00, bfloat 1.000000e+00 %ldexp = fmul bfloat %x, %y @@ -1796,72 +2656,114 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_bf16_test2: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_bf16_test2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f00 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_bf16_test2: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x3f00 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: v_bfe_u32 v1, v0, 
16, 1 -; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_bf16_test2: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_bf16_test2: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_bf16_test2: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, 
v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_bf16_test2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x3f80 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x3f00 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_bf16_test2: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_bf16_test2: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x3f00 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, 
v0 +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_bf16_test2: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_bf16_test2: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_bf16_test2: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 
0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 5.000000e-01, bfloat 1.000000e+00 %ldexp = fmul bfloat %x, %y @@ -1869,111 +2771,158 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) { -; GFX7-LABEL: fmul_select_v2bf16_test3: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_v2bf16_test3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x4000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff 
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_v2bf16_test3: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x4000 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX1030-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX1030-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX1030-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX1030-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX1030-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1030-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX1030-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1030-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1030-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX1030-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_v2bf16_test3: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v5, 0x4000 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX1100-NEXT: 
v_lshlrev_b32_e32 v3, 16, v0 -; GFX1100-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1100-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX1100-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1100-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX1100-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1100-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1100-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_v2bf16_test3: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; 
GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_v2bf16_test3: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_v2bf16_test3: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f80 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x4000 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-SDAG-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-SDAG-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v3, 0x400000, v0 
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-SDAG-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_v2bf16_test3: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_v2bf16_test3: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX10-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX10-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX10-GISEL-LABEL: fmul_select_v2bf16_test3: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_v2bf16_test3: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-SDAG-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_v2bf16_test3: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 2.000000e+00, bfloat 2.000000e+00>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00> %ldexp = fmul <2 x bfloat> %x, %y @@ -1981,111 +2930,158 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a } define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) { -; GFX7-LABEL: fmul_select_v2bf16_test4: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_v2bf16_test4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x3f00 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_v2bf16_test4: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x3f00 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX1030-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX1030-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX1030-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX1030-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX1030-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX1030-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1030-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX1030-NEXT: 
v_or_b32_e32 v5, 0x400000, v0 -; GFX1030-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1030-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX1030-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_v2bf16_test4: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_mov_b32_e32 v5, 0x3f00 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX1100-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX1100-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1100-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX1100-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1100-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX1100-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1100-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1100-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX1100-NEXT: 
s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_v2bf16_test4: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_v2bf16_test4: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_v2bf16_test4: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f80 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x3f00 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 +; 
GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-SDAG-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-SDAG-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-SDAG-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_v2bf16_test4: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_v2bf16_test4: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f00 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX10-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; 
GFX10-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX10-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_v2bf16_test4: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_v2bf16_test4: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f00 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 +; 
GFX11-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_v2bf16_test4: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 5.000000e-01, bfloat 5.000000e-01>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00> %ldexp = fmul <2 x bfloat> %x, %y @@ -2093,73 +3089,108 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a } define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_bf16_test5: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 
v3, 2.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_bf16_test5: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4100 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_bf16_test5: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_bf16_test5: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: 
v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_bf16_test5: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, 2.0, vcc +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_bf16_test5: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_bf16_test5: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x4100 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_bf16_test5: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_bf16_test5: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x4000 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_bf16_test5: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_bf16_test5: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_bf16_test5: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 2.000000e+00, bfloat 8.000000e+00 %ldexp = fmul bfloat %x, %y @@ -2167,74 +3198,116 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_bf16_test6: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x40400000 -; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1000000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; 
GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_bf16_test6: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4040 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc100 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_bf16_test6: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0xffffc100 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_bf16_test6: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: 
v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_bf16_test6: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1000000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_bf16_test6: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0xc100 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0x4040 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_bf16_test6: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x4040 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc100 
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_bf16_test6: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc100 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4040 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_bf16_test6: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc100 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_bf16_test6: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x4040 +; GFX10-GISEL-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_bf16_test6: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_bf16_test6: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4040 +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo +; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat -8.000000e+00, bfloat 3.000000e+00 %ldexp = fmul bfloat %x, %y @@ -2242,73 +3315,115 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define 
bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_bf16_test7: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_bf16_test7: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc080 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4100 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_bf16_test7: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x4100 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: s_setpc_b64 s[30:31] 
-; -; GFX1100-LABEL: fmul_select_bf16_test7: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_bf16_test7: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_bf16_test7: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x4100 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0xc080 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, 
v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_bf16_test7: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc080 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4100 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_bf16_test7: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x4100 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xc080 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_bf16_test7: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x4100 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 
v0, v1, v2, vcc_lo +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_bf16_test7: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xc080 +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_bf16_test7: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_bf16_test7: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xc080 +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo +; GFX11-GISEL-NEXT: 
v_mul_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 8.000000e+00, bfloat -4.000000e+00 %ldexp = fmul bfloat %x, %y @@ -2316,73 +3431,111 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_bf16_test8: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_bf16_test8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 15 -; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_bf16_test8: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX1030-NEXT: v_lshlrev_b16 v1, 15, v1 -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, 
v1 -; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_bf16_test8: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_lshlrev_b16 v1, 15, v1 -; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_bf16_test8: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_bf16_test8: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x8000 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_bf16_test8: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 15 +; GFX9-SDAG-NEXT: v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_bf16_test8: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x8000 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_bf16_test8: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 15, v1 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 
+; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_bf16_test8: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo +; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_bf16_test8: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b16 v1, 15, v1 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_bf16_test8: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 
v1, 0, 0x8000, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat -0.000000e+00, bfloat 0.000000e+00 %ldexp = fmul bfloat %x, %y @@ -2390,74 +3543,121 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_bf16_test9: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0xc2000000 -; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1800000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_bf16_test9: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc200 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc180 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_bf16_test9: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0xffffc180 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; 
GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_bf16_test9: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_bf16_test9: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2000000 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1800000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_bf16_test9: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, 5, v1 +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_bf16_test9: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc200 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc180 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_bf16_test9: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 5, v1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_bf16_test9: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc180 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_bf16_test9: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 5, v1 +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_bf16_test9: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_bf16_test9: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 5, v1 +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat -1.600000e+01, bfloat -3.200000e+01 %ldexp = fmul bfloat %x, %y @@ -2465,74 +3665,111 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0xdb800000 -; GFX7-NEXT: v_bfrev_b32_e32 v4, 7 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffdb80 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffe000 -; GFX9-NEXT: v_cmp_eq_u32_e32 
vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0xffffe000 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1100-NEXT: v_cmp_u_f32_e32 
vcc_lo, v0, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0xdb800000 +; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v4, 7 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x41 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, 56, v3, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffdb80 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffe000 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-SDAG-NEXT: v_add3_u32 
v1, v1, v0, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 56, v3, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffe000 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 0xRE000, bfloat 0xRDB80 %ldexp = fmul bfloat %x, %y @@ -2540,74 +3777,111 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b } define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_bfrev_b32_e32 v3, 50 -; GFX7-NEXT: v_mov_b32_e32 v4, 0x34800000 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; 
GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4c00 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3480 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1030-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v3, 0x3480 -; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; 
GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo -; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX7-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v3, 50 +; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0x34800000 +; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-GISEL-NEXT: v_not_b32_e32 v3, 21 +; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, 25, v3, vcc +; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x4c00 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x3480 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_not_b32_e32 v3, 21 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 25, v3, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x3480 +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX10-GISEL: ; %bb.0: +; 
GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 0xR3480, bfloat 0xR4C00 %ldexp = fmul bfloat %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir 
b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir new file mode 100644 index 000000000000..5c7c07632f0d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir @@ -0,0 +1,54 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=default -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=dummy -o - %s | FileCheck -check-prefixes=CHECK,DUMMY %s + +# Check that the regalloc-enable-priority-advisor=dummy option works +# and the result is different from the default. Ordinarily %1 would be +# prioritized higher than %0 due to the register class priority + +--- +name: foo +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vreg_128 } + - { id: 2, class: vgpr_32 } +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; DEFAULT-LABEL: name: foo + ; DEFAULT: liveins: $vgpr0, $vgpr1 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + ; DEFAULT-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; DEFAULT-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; DEFAULT-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; DEFAULT-NEXT: renamable $vgpr3 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from 
%stack.1, addrspace 5) + ; DEFAULT-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec + ; DEFAULT-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + ; + ; DUMMY-LABEL: name: foo + ; DUMMY: liveins: $vgpr0, $vgpr1 + ; DUMMY-NEXT: {{ $}} + ; DUMMY-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) + ; DUMMY-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; DUMMY-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; DUMMY-NEXT: renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; DUMMY-NEXT: renamable $vgpr3_vgpr4_vgpr5_vgpr6 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) + ; DUMMY-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr3, killed $vgpr2, implicit $exec + ; DUMMY-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + undef %1.sub0:vreg_128 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + %2:vgpr_32 = V_ADD_U32_e32 %1.sub0, %0, implicit $exec + $vgpr3 = COPY %2 + SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1 + +... 
+ +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index 73aa87e5c55d..9acb3a42ae10 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -1,64 +1,829 @@ -; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=+promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s -; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s -; RUN: not llc -mtriple=r600-- -mcpu=cypress < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-GISEL %s target datalayout = "A5" -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s32 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 +; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5 
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 +; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_lshl2_add_u32 s5, s5, 15 +; GFX9-GISEL-NEXT: s_and_b32 s5, s5, -16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 6 +; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-SDAG-NEXT: s_mov_b32 s32, 16 +; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-GISEL-NEXT: s_mov_b32 s32, 16 +; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 +; 
GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_lshl2_add_u32 s1, s1, 15 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_and_b32 s1, s1, -16 +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 +; GFX11-GISEL-NEXT: s_endpgm %alloca = alloca i32, i32 %n, addrspace(5) store volatile i32 123, ptr addrspace(5) %alloca ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i32 %n) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 +; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000 +; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 10 +; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4 +; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x2000 +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0x1fff +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-GISEL-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffffe000 +; GFX9-GISEL-NEXT: s_and_b32 s4, s4, -16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 10 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-SDAG-NEXT: s_movk_i32 s32, 0x80 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 10 +; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0xfff +; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-GISEL-NEXT: s_movk_i32 s32, 0x80 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 10 +; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0xfff +; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX11-GISEL-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_and_b32 s0, s0, -16 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_endpgm %alloca = alloca i32, i32 %n, align 128, addrspace(5) store volatile i32 10, ptr addrspace(5) %alloca ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(i32 %n) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s32 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 +; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22 +; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 +; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_lshl2_add_u32 s5, s5, 15 +; GFX9-GISEL-NEXT: s_and_b32 s5, s5, -16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 22 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 6 +; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, 
s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 22 +; GFX11-SDAG-NEXT: s_mov_b32 s32, 16 +; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 22 +; GFX11-GISEL-NEXT: s_mov_b32 s32, 16 +; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_lshl2_add_u32 s1, s1, 15 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_and_b32 s1, s1, -16 +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 +; GFX11-GISEL-NEXT: s_endpgm %alloca = alloca i32, i32 %n, align 2, addrspace(5) store volatile i32 22, ptr addrspace(5) %alloca ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define amdgpu_kernel void 
@test_dynamic_stackalloc_kernel_divergent() { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 +; GFX9-SDAG-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400 +; GFX9-GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: 
s_lshl_b32 s5, s6, 6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s32, 16 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX11-SDAG-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s32, 16 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB3_1: ; =>This Inner Loop 
Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca float, i32 %idx, addrspace(5) store volatile i32 123, ptr addrspace(5) %alloca ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 +; GFX9-SDAG-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff +; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000 +; GFX9-SDAG-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc +; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x2000 +; GFX9-GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0x1fff +; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffffe000 +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s6, 6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_movk_i32 s32, 0x80 +; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff +; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: 
v_and_b32_e32 v0, 0x1ff0, v0 +; GFX11-SDAG-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 +; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX11-GISEL-NEXT: s_movk_i32 s32, 0x80 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc +; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0xfff +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX11-GISEL-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca i32, i32 %idx, align 128, addrspace(5) store volatile i32 444, ptr addrspace(5) %alloca ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligned() { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 +; GFX9-SDAG-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 4, 15 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 +; 
GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400 +; GFX9-GISEL-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s32, 16 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-SDAG-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s32, 16 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 4, 15 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca i128, i32 %idx, align 2, addrspace(5) store volatile i32 666, ptr addrspace(5) %alloca ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %n, i32 %m) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX9-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 +; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2 +; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15 +; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff +; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 +; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec +; GFX9-SDAG-NEXT: s_add_i32 s32, s9, s5 +; GFX9-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s5, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s10, v0, s5 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s5 +; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX9-SDAG-NEXT: ; %bb.3: +; GFX9-SDAG-NEXT: s_mov_b32 s5, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1 +; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 +; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s32 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x2000 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.0 +; GFX9-GISEL-NEXT: s_lshl2_add_u32 s5, s5, 15 +; GFX9-GISEL-NEXT: s_and_b32 s5, s5, -16 +; GFX9-GISEL-NEXT: s_lshl_b32 s6, s5, 6 +; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0xfff +; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xfffff000 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s6 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9-GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9-GISEL-NEXT: v_readlane_b32 s10, v0, s9 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s10 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX9-GISEL-NEXT: ; %bb.3: +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: s_lshl_b32 s7, s8, 6 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s7 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: .LBB6_4: ; %bb.1 +; GFX9-GISEL-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s5, s32 +; GFX9-GISEL-NEXT: s_and_b32 s4, s4, -16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-SDAG-NEXT: s_mov_b32 s2, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX11-SDAG-NEXT: s_movk_i32 s32, 0x80 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 2 +; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff +; GFX11-SDAG-NEXT: s_add_i32 s1, s1, 15 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_and_b32 s4, s1, -16 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_and_b32 s1, s3, 0xfffff800 +; GFX11-SDAG-NEXT: s_lshl_b32 s3, s4, 5 +; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo +; GFX11-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s2, s2, s5 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX11-SDAG-NEXT: ; %bb.3: +; 
GFX11-SDAG-NEXT: s_mov_b32 s3, s32 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s2, 5, s3 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s3 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1 +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX11-GISEL-NEXT: s_movk_i32 s32, 0x80 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_lshl2_add_u32 s1, s1, 15 +; GFX11-GISEL-NEXT: s_add_u32 s3, s32, 0x7ff +; GFX11-GISEL-NEXT: s_and_b32 s1, s1, -16 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_lshl_b32 s4, s1, 5 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_and_b32 s1, s3, 0xfffff800 +; GFX11-GISEL-NEXT: s_mov_b32 s3, exec_lo +; 
GFX11-GISEL-NEXT: s_add_u32 s32, s1, s4 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX11-GISEL-NEXT: ; %bb.3: +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 +; GFX11-GISEL-NEXT: s_mov_b32 s3, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s2, s2, 5 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s3 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_u32 s32, s3, s2 +; GFX11-GISEL-NEXT: .LBB6_4: ; %bb.1 +; GFX11-GISEL-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-GISEL-NEXT: s_and_b32 s0, s0, -16 +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s33 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_endpgm entry: %cond = icmp eq i32 %n, 0 %alloca1 = alloca i32, i32 8, addrspace(5) @@ -77,10 +842,206 @@ bb.1: ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i32 %m) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x1000 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec +; GFX9-SDAG-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s8 +; GFX9-SDAG-NEXT: s_max_u32 s4, s4, s9 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX9-SDAG-NEXT: ; %bb.3: +; GFX9-SDAG-NEXT: s_mov_b32 s6, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s4, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5 +; GFX9-SDAG-NEXT: .LBB7_4: ; %bb.0 +; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2 +; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0xfff +; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15 +; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 +; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-SDAG-NEXT: s_add_i32 s32, s4, s5 +; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: .LBB7_5: ; %bb.2 +; GFX9-SDAG-NEXT: s_endpgm +; GFX9-SDAG-NEXT: .LBB7_6: +; GFX9-SDAG-NEXT: s_branch .LBB7_4 +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x1000 +; GFX9-GISEL-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.1 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9-GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[6:7] +; GFX9-GISEL-NEXT: v_readlane_b32 s9, v0, s4 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s4 +; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s9 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX9-GISEL-NEXT: ; %bb.3: +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: s_lshl_b32 s6, s8, 6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s6 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX9-GISEL-NEXT: .LBB7_4: ; %Flow +; GFX9-GISEL-NEXT: s_xor_b32 s4, s4, 1 +; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB7_6 +; GFX9-GISEL-NEXT: ; %bb.5: ; %bb.0 +; GFX9-GISEL-NEXT: s_lshl2_add_u32 s4, s5, 15 +; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0xfff +; GFX9-GISEL-NEXT: s_and_b32 s4, s4, -16 +; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xfffff000 +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: .LBB7_6: ; %bb.2 +; GFX9-GISEL-NEXT: 
s_endpgm +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s32, 64 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.1 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX11-SDAG-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 +; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s4 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX11-SDAG-NEXT: ; %bb.3: +; GFX11-SDAG-NEXT: s_mov_b32 s2, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s2 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5 +; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0 +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s1, 2 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 +; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0x7ff +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 +; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff800 +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 +; 
GFX11-SDAG-NEXT: .LBB7_5: ; %bb.2 +; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-NEXT: .LBB7_6: +; GFX11-SDAG-NEXT: s_branch .LBB7_4 +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s32, 64 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s0, 1 +; GFX11-GISEL-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.1 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s3, s0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3 +; GFX11-GISEL-NEXT: s_bitset0_b32 s0, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s4 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX11-GISEL-NEXT: ; %bb.3: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-GISEL-NEXT: s_mov_b32 s3, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s2, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s3, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s3 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: .LBB7_4: ; %Flow +; GFX11-GISEL-NEXT: s_xor_b32 s0, s0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_and_b32 s0, s0, 1 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-GISEL-NEXT: 
s_cbranch_scc1 .LBB7_6 +; GFX11-GISEL-NEXT: ; %bb.5: ; %bb.0 +; GFX11-GISEL-NEXT: s_lshl2_add_u32 s0, s1, 15 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0x7ff +; GFX11-GISEL-NEXT: s_and_b32 s0, s0, -16 +; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff800 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: .LBB7_6: ; %bb.2 +; GFX11-GISEL-NEXT: s_endpgm entry: %cond = icmp eq i32 %n, 0 br i1 %cond, label %bb.0, label %bb.1 @@ -97,62 +1058,1113 @@ bb.2: ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define void @test_dynamic_stackalloc_device_uniform(i32 %n) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 +; GFX9-SDAG-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_setpc_b64 
s[30:31] +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 +; GFX9-GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 +; GFX11-SDAG-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) 
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 +; GFX11-GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, addrspace(5) store volatile i32 
123, ptr addrspace(5) %alloca ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 +; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff +; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10 +; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 
0xffffe000 +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000 +; GFX9-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0x1fff +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s6, 6 +; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffffe000 +; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 10 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 +; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0xfff +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 10 +; GFX11-SDAG-NEXT: s_and_b32 s1, 
s1, 0xfffff000 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100 +; GFX11-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB9_1 +; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0xfff +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 10 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, align 128, addrspace(5) store volatile i32 10, ptr addrspace(5) %alloca ret void } -; CHECK: in function 
test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 +; GFX9-SDAG-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 +; GFX9-GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-GISEL-NEXT: 
s_bitset0_b64 s[4:5], s7 +; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 22 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 +; GFX11-SDAG-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 22 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 +; GFX11-GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 22 +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, align 2, addrspace(5) store volatile i32 22, ptr addrspace(5) %alloca ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define void @test_dynamic_stackalloc_device_divergent() { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; 
GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 +; GFX9-SDAG-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 +; GFX9-GISEL-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX11-SDAG-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: 
v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB11_1 +; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca i32, i32 %idx, addrspace(5) store volatile i32 123, ptr addrspace(5) %alloca ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define void @test_dynamic_stackalloc_device_divergent_over_aligned() { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s10, s33 +; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0 +; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 
0xffffe000 +; GFX9-SDAG-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc +; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s10 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000 +; GFX9-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0x1fff +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s6, 6 +; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffffe000 +; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, 
s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 +; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 +; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff +; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX11-SDAG-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 +; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc +; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; 
GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB12_1 +; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0xfff +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca i32, i32 %idx, align 128, addrspace(5) store volatile i32 444, ptr addrspace(5) %alloca ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define void @test_dynamic_stackalloc_device_divergent_under_aligned() { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], 
exec +; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 +; GFX9-SDAG-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB13_1 +; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 +; GFX9-GISEL-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB13_1 +; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX11-SDAG-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB13_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; 
GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB13_1 +; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca i32, i32 %idx, align 2, addrspace(5) store volatile i32 666, ptr addrspace(5) %alloca ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s13, s33 +; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 
0xfffff000 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x3000 +; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6 +; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec +; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 +; GFX9-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s11, v1, s9 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s11 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 +; GFX9-SDAG-NEXT: ; %bb.3: +; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff +; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1 +; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec +; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 +; GFX9-SDAG-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s11, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s12, v1, s11 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s11 +; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s12 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_4 +; GFX9-SDAG-NEXT: ; %bb.5: +; GFX9-SDAG-NEXT: s_mov_b32 s6, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 3 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-SDAG-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s6 +; GFX9-SDAG-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-SDAG-NEXT: .LBB14_6: ; %bb.1 +; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s7, v0, s6 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s6 +; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s7 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_7 +; GFX9-SDAG-NEXT: ; %bb.8: +; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xd000 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s13 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s13, s33 +; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x3000 +; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-GISEL-NEXT: s_cbranch_execz .LBB14_6 +; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.0 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9-GISEL-NEXT: s_mov_b32 s9, 0 +; GFX9-GISEL-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 +; 
GFX9-GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] +; GFX9-GISEL-NEXT: v_readlane_b32 s11, v1, s10 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s10 +; GFX9-GISEL-NEXT: s_max_u32 s9, s9, s11 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB14_2 +; GFX9-GISEL-NEXT: ; %bb.3: +; GFX9-GISEL-NEXT: s_add_u32 s7, s32, 0xfff +; GFX9-GISEL-NEXT: s_lshl_b32 s6, s9, 6 +; GFX9-GISEL-NEXT: s_and_b32 s9, s7, 0xfffff000 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v1, v2, 2, 15 +; GFX9-GISEL-NEXT: s_add_u32 s32, s9, s6 +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9-GISEL-NEXT: s_mov_b32 s10, 0 +; GFX9-GISEL-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s11, s[6:7] +; GFX9-GISEL-NEXT: v_readlane_b32 s12, v1, s11 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s11 +; GFX9-GISEL-NEXT: s_max_u32 s10, s10, s12 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB14_4 +; GFX9-GISEL-NEXT: ; %bb.5: +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-GISEL-NEXT: s_lshl_b32 s7, s10, 6 +; GFX9-GISEL-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s7 +; GFX9-GISEL-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: .LBB14_6: ; %bb.1 +; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s6, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s7, v0, s6 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s6 +; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s7 +; GFX9-GISEL-NEXT: 
s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB14_7 +; GFX9-GISEL-NEXT: ; %bb.8: +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s8, 6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xd000 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s13 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_mov_b32 s7, s33 +; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xc0 +; GFX11-SDAG-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6 +; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 +; GFX11-SDAG-NEXT: ; %bb.3: +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; 
GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s3, 5, s2 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1 +; GFX11-SDAG-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s5, s4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5 +; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_4 +; GFX11-SDAG-NEXT: ; %bb.5: +; GFX11-SDAG-NEXT: s_mov_b32 s4, s32 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s3, 5, s4 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s2 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s4 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX11-SDAG-NEXT: .LBB14_6: ; %bb.1 +; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v0, 2, 15 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: 
s_max_u32 s0, s0, s3 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7 +; GFX11-SDAG-NEXT: ; %bb.8: +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff40 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_mov_b32 s7, s33 +; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63 +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xc0 +; GFX11-GISEL-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-GISEL-NEXT: s_cbranch_execz .LBB14_6 +; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.0 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v2, v1, 2, 15 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX11-GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_and_b32_e32 v2, -16, v2 +; GFX11-GISEL-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s5, v2, s4 +; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_2 +; GFX11-GISEL-NEXT: ; %bb.3: 
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX11-GISEL-NEXT: s_lshl_b32 s5, s2, 5 +; GFX11-GISEL-NEXT: s_add_u32 s2, s32, 0x7ff +; GFX11-GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xfffff800 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-GISEL-NEXT: s_mov_b32 s3, 0 +; GFX11-GISEL-NEXT: s_add_u32 s32, s2, s5 +; GFX11-GISEL-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s5, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s6, v1, s5 +; GFX11-GISEL-NEXT: s_bitset0_b32 s4, s5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s3, s3, s6 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_4 +; GFX11-GISEL-NEXT: ; %bb.5: +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s3, s3, 5 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s2 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s4 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_u32 s32, s4, s3 +; GFX11-GISEL-NEXT: .LBB14_6: ; %bb.1 +; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_7 +; GFX11-GISEL-NEXT: ; 
%bb.8: +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s33 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s7 +; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff40 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %n, 0 %alloca1 = alloca i32, i32 8, addrspace(5) @@ -171,10 +2183,272 @@ bb.1: ret void } -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca - define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s11, s33 +; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 +; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x2000 +; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec +; GFX9-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s10, v1, s9 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 +; GFX9-SDAG-NEXT: ; 
%bb.3: +; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff +; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s8, 6, v1 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2 +; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: ; implicit-def: $vgpr31 +; GFX9-SDAG-NEXT: .LBB15_4: ; %Flow +; GFX9-SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_8 +; GFX9-SDAG-NEXT: ; %bb.5: ; %bb.0 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec +; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 +; GFX9-SDAG-NEXT: .LBB15_6: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s10, v0, s9 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_6 +; GFX9-SDAG-NEXT: ; %bb.7: +; GFX9-SDAG-NEXT: s_mov_b32 s6, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2 +; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xe000 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s11 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s11, s33 +; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 +; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v0 +; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x2000 +; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-GISEL-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.1 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9-GISEL-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9-GISEL-NEXT: v_readlane_b32 s10, v0, s9 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s10 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB15_2 +; GFX9-GISEL-NEXT: ; %bb.3: +; GFX9-GISEL-NEXT: s_add_u32 s7, s32, 0xfff +; GFX9-GISEL-NEXT: s_and_b32 s7, s7, 0xfffff000 +; GFX9-GISEL-NEXT: s_lshl_b32 s6, s8, 6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-GISEL-NEXT: s_add_u32 s32, s7, s6 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: ; implicit-def: $vgpr31 +; GFX9-GISEL-NEXT: .LBB15_4: ; %Flow +; GFX9-GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-GISEL-NEXT: s_cbranch_execz .LBB15_8 +; GFX9-GISEL-NEXT: ; %bb.5: ; %bb.0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 +; GFX9-GISEL-NEXT: .LBB15_6: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] +; GFX9-GISEL-NEXT: v_readlane_b32 s10, v0, s9 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s9 +; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s10 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB15_6 +; GFX9-GISEL-NEXT: ; %bb.7: +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; 
GFX9-GISEL-NEXT: s_lshl_b32 s7, s8, 6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s7 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: .LBB15_8: ; %bb.2 +; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xe000 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s11 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: +; GFX11-SDAG: ; %bb.0: ; %entry +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 +; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 +; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x80 +; GFX11-SDAG-NEXT: v_cmpx_ne_u32_e32 0, v0 +; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_4 +; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s4, v1, s3 +; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 +; GFX11-SDAG-NEXT: ; %bb.3: +; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s1, 5, s2 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX11-SDAG-NEXT: .LBB15_4: ; %Flow +; GFX11-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_8 +; GFX11-SDAG-NEXT: ; %bb.5: ; %bb.0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX11-SDAG-NEXT: .LBB15_6: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 +; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_6 +; GFX11-SDAG-NEXT: ; %bb.7: +; GFX11-SDAG-NEXT: s_mov_b32 s2, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s2 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2 +; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff80 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: +; GFX11-GISEL: ; %bb.0: ; %entry +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_mov_b32 s5, s33 +; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63 +; 
GFX11-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x80 +; GFX11-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v0 +; GFX11-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-GISEL-NEXT: s_cbranch_execz .LBB15_4 +; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.1 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3 +; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_2 +; GFX11-GISEL-NEXT: ; %bb.3: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-GISEL-NEXT: s_add_u32 s2, s32, 0x7ff +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xfffff800 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr31 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s2, s1 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s2 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: .LBB15_4: ; %Flow +; GFX11-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-GISEL-NEXT: s_cbranch_execz .LBB15_8 +; GFX11-GISEL-NEXT: ; %bb.5: ; %bb.0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB15_6: ; =>This Inner Loop Header: 
Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3 +; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_6 +; GFX11-GISEL-NEXT: ; %bb.7: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-GISEL-NEXT: s_mov_b32 s2, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s2, s1 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s2 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: .LBB15_8: ; %bb.2 +; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff80 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s5 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %n, 0 br i1 %cond, label %bb.0, label %bb.1 @@ -190,3 +2464,257 @@ bb.1: bb.2: ret void } + +define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 %n) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 +; GFX9-SDAG-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB16_1 +; 
GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 +; GFX9-GISEL-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-NEXT: 
s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0 +; GFX11-SDAG-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB16_1 +; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %alloca = alloca i32, i16 %n, align 2, addrspace(5) + store volatile i32 666, ptr addrspace(5) %alloca + ret void +} + +define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 %n) { +; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec +; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 +; GFX9-SDAG-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 
s4 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 +; GFX9-GISEL-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 +; GFX11-SDAG-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 +; GFX11-GISEL-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 +; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB17_1 +; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; 
GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %alloca = alloca i32, i64 %n, align 2, addrspace(5) + store volatile i32 666, ptr addrspace(5) %alloca + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll index ebfb5e9ccaa3..a324ba35b155 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll @@ -1625,14 +1625,12 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) { ; CODEGEN-IEEE-GISEL: ; %bb.0: ; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x4b800000 ; CODEGEN-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-GISEL-NEXT: v_rsq_f32_e32 v0, v0 -; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x45800000 -; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 97d642b991f7..5415af02ef89 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -5249,6 +5249,114 @@ bb: ret void } +define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) { +; GFX9-LABEL: 
sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, -16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s5 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_add3_u32 v0, s2, s3, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:-16 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_add3_u32 v0, s0, s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:-16 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm +; +; GFX9-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX9-PAL: ; %bb.0: ; %bb +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s8 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-PAL-NEXT: 
s_add_i32 s0, s0, s1 +; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-PAL-NEXT: v_add_u32_e32 v0, -16, v0 +; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_endpgm +; +; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX940: ; %bb.0: ; %bb +; GFX940-NEXT: s_add_i32 s0, s0, s1 +; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX940-NEXT: v_add_u32_e32 v0, -16, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, 15 +; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_endpgm +; +; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX10-PAL: ; %bb.0: ; %bb +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s8 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-PAL-NEXT: v_add3_u32 v0, s0, s1, v0 +; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:-16 +; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: s_endpgm +; +; GFX11-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX11-PAL: ; %bb.0: ; %bb +; GFX11-PAL-NEXT: v_add3_u32 v0, s0, s1, v0 +; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:-16 dlc +; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-NEXT: s_endpgm +; +; GFX12-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: +; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: 
s_endpgm +bb: + %add1 = add nsw i32 %sidx, %vidx + %add2 = add nsw i32 %add1, -16 + %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2 + store volatile i32 15, ptr addrspace(5) %gep, align 4 + ret void +} + define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) { ; GFX9-LABEL: sgpr_base_negative_offset: ; GFX9: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index 005e40159f61..822d40f7349b 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -5,6 +5,8 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL declare half @llvm.fma.f16(half, half, half) declare half @llvm.maxnum.f16(half, half) @@ -27,6 +29,16 @@ define half @test_fma(half %x, half %y, half %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_fma: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %r = call half @llvm.fma.f16(half %x, half %y, half %z) ret half %r } @@ -50,6 +62,16 @@ define half 
@test_fmac(half %x, half %y, half %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fmac_f16_e32 v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_fmac: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fmac_f16_e32 v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %r = call half @llvm.fma.f16(half %y, half %z, half %x) ret half %r } @@ -81,6 +103,16 @@ define half @test_fmaak(half %x, half %y, half %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_fmaak: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 +; GFX12-NEXT: s_setpc_b64 s[30:31] %r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200) ret half %r } @@ -112,6 +144,16 @@ define half @test_fmamk(half %x, half %y, half %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_fmamk: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %r = call half @llvm.fma.f16(half %x, half 0xH4200, half %z) ret half %r } @@ -193,6 +235,42 @@ define i32 @test_D139469_f16(half %arg) { ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: test_D139469_f16: +; GFX12-SDAG: ; %bb.0: ; %bb +; 
GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e +; GFX12-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 +; GFX12-SDAG-NEXT: v_min_num_f16_e32 v0, v2, v1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: test_D139469_f16: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e +; GFX12-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 +; GFX12-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 +; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] bb: %i = fmul contract half %arg, 0xH291E %i1 = fcmp olt half %i, 0xH0000 @@ -306,6 +384,55 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: test_D139469_v2f16: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x211e +; GFX12-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_pk_min_num_f16 v0, v1, v0 +; GFX12-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: test_D139469_v2f16: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e +; GFX12-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX12-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 +; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 +; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 
0, 1, s0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_or_b32 s0, s1, s2 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] bb: %i = fmul contract <2 x half> %arg, <half 0xH291E, half 0xH291E> %i1 = fcmp olt <2 x half> %i, <half 0xH0000, half 0xH0000> diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 4b3f0dbbaea9..fbcdbed338e6 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f32: @@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmax3_olt_0_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: 
s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmax3_olt_1_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmax3_olt_0_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f16 v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; 
GFX11-NEXT: v_max3_f16 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmax3_olt_1_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max3_num_f16 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; GFX11-NEXT: v_pk_max_f16 v0, v2, v0 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: no_fmax3_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: 
v_pk_max_num_f16 v0, v2, v0 +; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max) diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 38b712e044df..269fd52df5c4 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f32: @@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_0_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: 
s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_min3_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_1_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: 
v_min3_num_f32 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 %c = load volatile float, ptr addrspace(1) %cptr, align 4 @@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_0_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_min3_num_f16 v0, v0, v1, v2 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: 
s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_1_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_mov_b32 s22, s10 +; GFX12-NEXT: s_mov_b32 s23, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: s_mov_b32 s20, s6 +; GFX12-NEXT: s_mov_b32 s21, s7 +; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_min3_num_f16 v0, v2, v0, v1 +; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 %c = load volatile half, ptr addrspace(1) %cptr, align 2 @@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; GFX11-NEXT: v_pk_min_f16 v0, v2, v0 ; GFX11-NEXT: v_pk_min_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: no_fmin3_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_min_num_f16 v0, v2, v0 +; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] 
entry: %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min) @@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_0_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s6 +; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 @@ -877,6 +1044,39 @@ define amdgpu_kernel void 
@test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: test_fmin3_olt_1_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s10, -1 +; GFX12-NEXT: s_mov_b32 s11, 0x31016000 +; GFX12-NEXT: s_mov_b32 s14, s10 +; GFX12-NEXT: s_mov_b32 s15, s11 +; GFX12-NEXT: s_mov_b32 s18, s10 +; GFX12-NEXT: s_mov_b32 s19, s11 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s2 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s16, s4 +; GFX12-NEXT: s_mov_b32 s17, s5 +; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s12, s6 +; GFX12-NEXT: s_mov_b32 s13, s7 +; GFX12-NEXT: s_mov_b32 s8, s0 +; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_mov_b32 s9, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX12-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 %c = load volatile double, ptr addrspace(1) %cptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 104e157e9e15..9ae60f99d5e0 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -3307,489 
+3307,459 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) { ; -------------------------------------------------------------------- define float @v_mul_f32_select_64_1(i32 %arg, float %x) { -; GFX9-SDAG-LABEL: v_mul_f32_select_64_1: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc -; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_f32_select_64_1: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_mul_f32_select_64_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_mul_f32_select_64_1: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_f32_select_64_1: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX1011-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + %cond = icmp eq i32 %arg, 0 + %select.pow2 = select i1 %cond, float 64.0, float 1.0 + %mul = fmul float %x, %select.pow2 + ret float %mul +} + +define float @v_mul_f32_select_1_64(i32 %arg, float %x) { +; GFX9-LABEL: 
v_mul_f32_select_1_64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: v_mul_f32_select_64_1: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_f32_select_1_64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX1011-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + %cond = icmp eq i32 %arg, 0 + %select.pow2 = select i1 %cond, float 1.0, float 64.0 + %mul = fmul float %x, %select.pow2 + ret float %mul +} + +define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) { +; GFX9-LABEL: v_mul_f32_select_n1_n64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_mul_f32_select_64_1: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_f32_select_n1_n64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX1011-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + %cond 
= icmp eq i32 %arg, 0 + %select.pow2 = select i1 %cond, float -1.0, float -64.0 + %mul = fmul float %x, %select.pow2 + ret float %mul +} + +define float @v_mul_f32_select_n64_n1(i32 %arg, float %x) { +; GFX9-LABEL: v_mul_f32_select_n64_n1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f32_select_64_1: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_f32_select_n64_n1: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX1011-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 - %select.pow2 = select i1 %cond, float 64.0, float 1.0 + %select.pow2 = select i1 %cond, float -64.0, float -1.0 %mul = fmul float %x, %select.pow2 ret float %mul } -define float @v_mul_f32_select_1_64(i32 %arg, float %x) { -; GFX9-SDAG-LABEL: v_mul_f32_select_1_64: +define float @v_mul_f32_select_128_64(i32 %arg, float %x) { +; GFX9-SDAG-LABEL: v_mul_f32_select_128_64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc ; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_mul_f32_select_1_64: +; GFX9-GISEL-LABEL: v_mul_f32_select_128_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 1.0, vcc -; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_mul_f32_select_1_64: +; GFX10-SDAG-LABEL: v_mul_f32_select_128_64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo ; GFX10-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: v_mul_f32_select_1_64: +; GFX10-GISEL-LABEL: v_mul_f32_select_128_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_mul_f32_select_1_64: +; GFX11-SDAG-LABEL: v_mul_f32_select_128_64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo ; GFX11-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f32_select_1_64: +; GFX11-GISEL-LABEL: v_mul_f32_select_128_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: 
v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 - %select.pow2 = select i1 %cond, float 1.0, float 64.0 + %select.pow2 = select i1 %cond, float 128.0, float 64.0 %mul = fmul float %x, %select.pow2 ret float %mul } -define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) { -; GFX9-SDAG-LABEL: v_mul_f32_select_n1_n64: +define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) { +; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc ; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_mul_f32_select_n1_n64: +; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2800000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1.0, vcc -; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_mul_f32_select_n1_n64: +; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo ; GFX10-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: 
v_mul_f32_select_n1_n64: +; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX10-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_mul_f32_select_n1_n64: +; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo ; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f32_select_n1_n64: +; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 - %select.pow2 = select i1 %cond, float -1.0, float -64.0 + %select.pow2 = select i1 %cond, float -128.0, float -64.0 %mul = fmul float %x, %select.pow2 ret float %mul } -define float @v_mul_f32_select_n64_n1(i32 %arg, float %x) { -; GFX9-SDAG-LABEL: v_mul_f32_select_n64_n1: +define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) { +; GFX9-LABEL: v_mul_f32_select_n128_n16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX9-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc +; GFX9-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_mul_f32_select_n128_n16: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo +; GFX1011-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + %cond = icmp eq i32 %arg, 0 + %select.pow2 = select i1 %cond, float -128.0, float -16.0 + %mul = fmul float %x, %select.pow2 + ret float %mul +} + +define float @v_contract_mul_add_f32_select_64_1(i32 %arg, float %x, float %y) { +; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_64_1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc -; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; GFX9-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_mul_f32_select_n64_n1: +; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_64_1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2800000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc -; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_mul_f32_select_n64_n1: +; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_64_1: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo -; GFX10-SDAG-NEXT: 
v_ldexp_f32 v0, -v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo +; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: v_mul_f32_select_n64_n1: +; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_64_1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_mul_f32_select_n64_n1: +; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_64_1: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo +; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f32_select_n64_n1: +; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_64_1: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 - %select.pow2 = select i1 %cond, float -64.0, float -1.0 - %mul = fmul float %x, %select.pow2 - ret float %mul + %select.pow2 = select contract i1 %cond, float 64.0, float 1.0 + %mul = fmul contract float %x, %select.pow2 
+ %fma = fadd contract float %mul, %y + ret float %fma } -define float @v_mul_f32_select_128_64(i32 %arg, float %x) { -; GFX9-SDAG-LABEL: v_mul_f32_select_128_64: +define float @v_contract_mul_add_f32_select_1_64(i32 %arg, float %x, float %y) { +; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_1_64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc -; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, 1.0, vcc +; GFX9-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_mul_f32_select_128_64: +; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_1_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x43000000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_mul_f32_select_128_64: +; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_1_64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo +; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: v_mul_f32_select_128_64: +; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_1_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x43000000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_mul_f32_select_128_64: +; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_1_64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo +; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f32_select_128_64: +; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_1_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x43000000, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 - %select.pow2 = select i1 %cond, float 128.0, float 64.0 - %mul = fmul float %x, %select.pow2 - ret float %mul + %select.pow2 = select contract i1 %cond, float 1.0, float 64.0 + %mul = fmul contract float %x, %select.pow2 + %fma = fadd contract float %mul, %y + ret float %fma } -define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) { -; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n64: +define float @v_contract_mul_add_f32_select_n64_n1(i32 %arg, float %x, float %y) { +; GFX9-SDAG-LABEL: 
v_contract_mul_add_f32_select_n64_n1: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2800000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc -; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, -1.0, v3, vcc +; GFX9-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n64: +; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc3000000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2800000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n64: +; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_n64_n1: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo +; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n64: +; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2800000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 
v0, 0, 6, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n64: +; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_n64_n1: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo +; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n64: +; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2800000 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 - %select.pow2 = select i1 %cond, float -128.0, float -64.0 - %mul = fmul float %x, %select.pow2 - ret float %mul + %select.pow2 = select contract i1 %cond, float -64.0, float -1.0 + %mul = fmul contract float %x, %select.pow2 + %fma = fadd contract float %mul, %y + ret float %fma } -define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) { -; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n16: +define float @v_contract_mul_add_f32_select_n1_n64(i32 %arg, float %x, float %y) { +; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_n1_n64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2800000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: 
v_cndmask_b32_e64 v0, 4, 7, vcc -; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, -1.0, vcc +; GFX9-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n16: +; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc3000000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1800000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n16: +; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_n1_n64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo +; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n16: +; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0xc1800000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n16: +; GFX11-SDAG-LABEL: 
v_contract_mul_add_f32_select_n1_n64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo +; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n16: +; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0xc1800000 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0 +; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 - %select.pow2 = select i1 %cond, float -128.0, float -16.0 - %mul = fmul float %x, %select.pow2 - ret float %mul -} - -define float @v_contract_mul_add_f32_select_64_1(i32 %arg, float %x, float %y) { -; GFX9-LABEL: v_contract_mul_add_f32_select_64_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1011-LABEL: v_contract_mul_add_f32_select_64_1: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1011-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo -; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX1011-NEXT: s_setpc_b64 s[30:31] - %cond = icmp eq i32 %arg, 0 - %select.pow2 = select contract i1 %cond, float 64.0, float 1.0 - 
%mul = fmul contract float %x, %select.pow2 - %fma = fadd contract float %mul, %y - ret float %fma -} - -define float @v_contract_mul_add_f32_select_1_64(i32 %arg, float %x, float %y) { -; GFX9-LABEL: v_contract_mul_add_f32_select_1_64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 1.0, vcc -; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1011-LABEL: v_contract_mul_add_f32_select_1_64: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo -; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX1011-NEXT: s_setpc_b64 s[30:31] - %cond = icmp eq i32 %arg, 0 - %select.pow2 = select contract i1 %cond, float 1.0, float 64.0 - %mul = fmul contract float %x, %select.pow2 - %fma = fadd contract float %mul, %y - ret float %fma -} - -define float @v_contract_mul_add_f32_select_n64_n1(i32 %arg, float %x, float %y) { -; GFX9-LABEL: v_contract_mul_add_f32_select_n64_n1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xc2800000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, -1.0, v3, vcc -; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1011-LABEL: v_contract_mul_add_f32_select_n64_n1: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1011-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo -; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX1011-NEXT: s_setpc_b64 s[30:31] - %cond = icmp eq i32 %arg, 0 - %select.pow2 = select contract i1 %cond, float -64.0, float -1.0 - %mul = fmul contract float %x, %select.pow2 - %fma = fadd contract float %mul, %y - ret float %fma -} - 
-define float @v_contract_mul_add_f32_select_n1_n64(i32 %arg, float %x, float %y) { -; GFX9-LABEL: v_contract_mul_add_f32_select_n1_n64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xc2800000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, -1.0, vcc -; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1011-LABEL: v_contract_mul_add_f32_select_n1_n64: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo -; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX1011-NEXT: s_setpc_b64 s[30:31] - %cond = icmp eq i32 %arg, 0 %select.pow2 = select contract i1 %cond, float -1.0, float -64.0 %mul = fmul contract float %x, %select.pow2 %fma = fadd contract float %mul, %y @@ -3810,11 +3780,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y) ; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_128_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x43000000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x42800000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-GISEL-NEXT: v_fma_f32 v0, v1, v0, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_128_64: @@ -3829,10 +3799,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y) ; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_128_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; 
GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x43000000, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f32 v0, v1, v0, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_128_64: @@ -3847,10 +3818,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y) ; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_128_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x43000000, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f32 v0, v1, v0, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, float 128.0, float 64.0 @@ -3860,22 +3832,57 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y) } define float @v_contract_mul_add_f32_select_128_4(i32 %arg, float %x, float %y) { -; GFX9-LABEL: v_contract_mul_add_f32_select_128_4: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x43000000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 4.0, v3, vcc -; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_128_4: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x43000000 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, 4.0, v3, vcc +; GFX9-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_contract_mul_add_f32_select_128_4: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1011-NEXT: v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo -; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_128_4: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_128_4: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo +; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_128_4: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_128_4: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo +; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_128_4: +; GFX11-GISEL: ; 
%bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, float 128.0, float 4.0 %mul = fmul contract float %x, %select.pow2 @@ -3907,143 +3914,102 @@ define float @v_contract_mul_add_f32_select_2_4(i32 %arg, float %x, float %y) { } define float @v_contract_mul_add_f32_select_4_128(i32 %arg, float %x, float %y) { -; GFX9-LABEL: v_contract_mul_add_f32_select_4_128: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x43000000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 4.0, vcc -; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX1011-LABEL: v_contract_mul_add_f32_select_4_128: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo -; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2 -; GFX1011-NEXT: s_setpc_b64 s[30:31] - %cond = icmp eq i32 %arg, 0 - %select.pow2 = select i1 %cond, float 4.0, float 128.0 - %mul = fmul contract float %x, %select.pow2 - %fma = fadd contract float %mul, %y - ret float %fma -} - -define double @v_mul_f64_select_64_1(i32 %arg, double %x) { -; GFX9-SDAG-LABEL: v_mul_f64_select_64_1: +; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_4_128: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x43000000 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc -; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, 4.0, vcc +; GFX9-SDAG-NEXT: v_fma_f32 v0, 
v1, v0, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_mul_f64_select_64_1: +; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_4_128: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x40500000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x3ff00000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_mul_f64_select_64_1: +; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_4_128: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo +; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: v_mul_f64_select_64_1: +; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_4_128: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x3ff00000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x40500000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_mul_f64_select_64_1: +; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_4_128: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo +; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f64_select_64_1: +; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_4_128: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0x3ff00000 :: v_dual_mov_b32 v3, 0 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x40500000, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 + %select.pow2 = select i1 %cond, float 4.0, float 128.0 + %mul = fmul contract float %x, %select.pow2 + %fma = fadd contract float %mul, %y + ret float %fma +} + +define double @v_mul_f64_select_64_1(i32 %arg, double %x) { +; GFX9-LABEL: v_mul_f64_select_64_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_mul_f64_select_64_1: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX1011-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, double 64.0, double 1.0 %mul = fmul double %x, %select.pow2 ret double %mul } define double @v_mul_f64_select_1_64(i32 %arg, double %x) { 
-; GFX9-SDAG-LABEL: v_mul_f64_select_1_64: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc -; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_f64_select_1_64: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x3ff00000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40500000 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_mul_f64_select_1_64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_mul_f64_select_1_64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x40500000 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x3ff00000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_mul_f64_select_1_64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_mul_f64_select_1_64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f64_select_1_64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0 -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x3ff00000, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_f64_select_1_64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX1011-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, double 1.0, double 64.0 %mul = fmul double %x, %select.pow2 @@ -4051,59 +4017,21 @@ define double @v_mul_f64_select_1_64(i32 %arg, double %x) { } define double @v_mul_f64_select_n1_n64(i32 %arg, double %x) { -; GFX9-SDAG-LABEL: v_mul_f64_select_n1_n64: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc -; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_f64_select_n1_n64: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xbff00000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xc0500000 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_mul_f64_select_n1_n64: -; GFX10-SDAG: ; 
%bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_mul_f64_select_n1_n64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xc0500000 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xbff00000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_mul_f64_select_n1_n64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_mul_f64_select_n1_n64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f64_select_n1_n64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0xc0500000 :: v_dual_mov_b32 v3, 0 -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xbff00000, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_f64_select_n1_n64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX1011-NEXT: v_ldexp_f64 
v[0:1], -v[1:2], v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, double -1.0, double -64.0 %mul = fmul double %x, %select.pow2 @@ -4122,12 +4050,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) { ; GFX9-GISEL-LABEL: v_mul_f64_select_128_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x40600000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40500000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_f64_select_128_64: @@ -4141,11 +4067,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) { ; GFX10-GISEL-LABEL: v_mul_f64_select_128_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x40500000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x40600000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_f64_select_128_64: @@ -4159,10 +4084,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) { ; GFX11-GISEL-LABEL: v_mul_f64_select_128_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; 
GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x40600000, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, double 128.0, double 64.0 @@ -4182,12 +4107,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) { ; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xc0600000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xc0500000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n64: @@ -4201,11 +4124,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) { ; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xc0500000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n64: @@ -4219,10 +4141,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) { ; GFX11-GISEL-LABEL: 
v_mul_f64_select_n128_n64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0xc0500000 :: v_dual_mov_b32 v3, 0 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, double -128.0, double -64.0 @@ -4231,59 +4153,21 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) { } define double @v_mul_f64_select_n128_n16(i32 %arg, double %x) { -; GFX9-SDAG-LABEL: v_mul_f64_select_n128_n16: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc -; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xc0600000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xc0300000 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n16: -; GFX10-GISEL: ; %bb.0: -; 
GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xc0300000 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_mul_f64_select_n128_n16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc +; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f64_select_n128_n16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0xc0300000 :: v_dual_mov_b32 v3, 0 -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4] -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_f64_select_n128_n16: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo +; GFX1011-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, double -128.0, double -16.0 %mul = fmul double %x, %select.pow2 @@ -4305,12 +4189,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y ; GFX9-GISEL-LABEL: 
v_contract_mul_add_f64_select_64_1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x40500000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0x3ff00000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_64_1: @@ -4326,11 +4208,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_64_1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40500000, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_64_1: @@ -4345,10 +4226,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_64_1: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0x3ff00000 :: v_dual_mov_b32 v5, 0 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40500000, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, 
vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select contract i1 %cond, double 64.0, double 1.0 @@ -4372,12 +4253,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_1_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0x40500000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_1_64: @@ -4393,11 +4272,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_1_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0x40500000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x3ff00000, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_1_64: @@ -4412,10 +4290,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_1_64: ; GFX11-GISEL: ; 
%bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0x40500000 :: v_dual_mov_b32 v5, 0 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x3ff00000, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select contract i1 %cond, double 1.0, double 64.0 @@ -4439,12 +4317,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0xc0500000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0xbff00000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1: @@ -4460,11 +4336,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0xbff00000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0xc0500000, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; 
GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1: @@ -4479,10 +4354,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0xbff00000 :: v_dual_mov_b32 v5, 0 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0xc0500000, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select contract i1 %cond, double -64.0, double -1.0 @@ -4506,12 +4381,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0xbff00000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0xc0500000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64: @@ -4527,11 +4400,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64: ; GFX10-GISEL: ; %bb.0: ; 
GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0xc0500000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0xbff00000, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64: @@ -4546,10 +4418,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0xc0500000 :: v_dual_mov_b32 v5, 0 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0xbff00000, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0 +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select contract i1 %cond, double -1.0, double -64.0 @@ -4573,12 +4445,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_128_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x40600000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0x40500000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; 
GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_128_64: @@ -4594,11 +4465,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_128_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0x40500000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_128_64: @@ -4613,10 +4484,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_128_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0x40500000 :: v_dual_mov_b32 v5, 0 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, double 128.0, double 64.0 @@ -4640,12 +4512,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, 
double %x, double % ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_128_4: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x40600000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0x40100000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_128_4: @@ -4661,11 +4531,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double % ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_128_4: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0x40100000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_128_4: @@ -4680,10 +4549,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double % ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_128_4: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0x40100000 :: v_dual_mov_b32 v5, 0 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; 
GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, double 128.0, double 4.0 @@ -4706,21 +4575,50 @@ define double @v_contract_mul_add_f64_select_2_4(i32 %arg, double %x, double %y) ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_2_4: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x40100000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 2.0, vcc -; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1011-LABEL: v_contract_mul_add_f64_select_2_4: -; GFX1011: ; %bb.0: -; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1011-NEXT: v_mov_b32_e32 v5, 0 -; GFX1011-NEXT: v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo -; GFX1011-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] -; GFX1011-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_2_4: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo +; GFX10-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_2_4: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; 
GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_2_4: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo +; GFX11-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_2_4: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, double 2.0, double 4.0 %mul = fmul contract double %x, %select.pow2 @@ -4743,12 +4641,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double % ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_4_128: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x40100000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0x40600000 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc +; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: 
v_contract_mul_add_f64_select_4_128: @@ -4764,11 +4660,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double % ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_4_128: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0x40600000 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40100000, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_4_128: @@ -4783,10 +4678,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double % ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_4_128: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0x40600000 :: v_dual_mov_b32 v5, 0 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40100000, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4] +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0 +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, double 4.0, double 128.0 @@ -4796,57 +4691,21 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double % } define half @v_mul_f16_select_64_1(i32 %arg, half %x) { -; GFX9-SDAG-LABEL: v_mul_f16_select_64_1: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc -; 
GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_f16_select_64_1: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x5400 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_mul_f16_select_64_1: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_mul_f16_select_64_1: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x5400, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_mul_f16_select_64_1: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_mul_f16_select_64_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f16_select_64_1: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00 -; GFX11-GISEL-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x5400, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_f16_select_64_1: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX1011-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, half 64.0, half 1.0 %mul = fmul half %x, %select.pow2 @@ -4854,57 +4713,21 @@ define half @v_mul_f16_select_64_1(i32 %arg, half %x) { } define half @v_mul_f16_select_1_64(i32 %arg, half %x) { -; GFX9-SDAG-LABEL: v_mul_f16_select_1_64: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc -; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_f16_select_1_64: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_mul_f16_select_1_64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_mul_f16_select_1_64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x5400 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x3c00, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_mul_f16_select_1_64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_mul_f16_select_1_64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f16_select_1_64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x5400 -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x3c00, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_f16_select_1_64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX1011-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, half 1.0, half 64.0 %mul = fmul half %x, %select.pow2 @@ -4912,57 +4735,21 @@ define half @v_mul_f16_select_1_64(i32 %arg, half %x) { } define half @v_mul_f16_select_n1_n64(i32 %arg, half %x) { -; GFX9-SDAG-LABEL: v_mul_f16_select_n1_n64: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc -; GFX9-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 -; GFX9-SDAG-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_f16_select_n1_n64: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xbc00 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_mul_f16_select_n1_n64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_mul_f16_select_n1_n64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0xd400 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xbc00, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_mul_f16_select_n1_n64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_mul_f16_select_n1_n64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-NEXT: v_ldexp_f16_e64 v0, -v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f16_select_n1_n64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0xd400 -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; 
GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xbc00, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_f16_select_n1_n64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX1011-NEXT: v_ldexp_f16_e64 v0, -v1, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, half -1.0, half -64.0 %mul = fmul half %x, %select.pow2 @@ -4981,11 +4768,13 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) { ; GFX9-GISEL-LABEL: v_mul_f16_select_128_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x5800 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_f16_select_128_64: @@ -4999,10 +4788,12 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) { ; GFX10-GISEL-LABEL: v_mul_f16_select_128_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x5400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x5800, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, 
v0, v2 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_f16_select_128_64: @@ -5016,10 +4807,12 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) { ; GFX11-GISEL-LABEL: v_mul_f16_select_128_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x5400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x5800, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, half 128.0, half 64.0 @@ -5039,11 +4832,13 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) { ; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xd800 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n64: @@ -5057,10 +4852,12 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) { ; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v2, 0xd400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v2 +; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n64: @@ -5074,10 +4871,12 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) { ; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0xd400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, half -128.0, half -64.0 @@ -5086,57 +4885,21 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) { } define half @v_mul_f16_select_n128_n16(i32 %arg, half %x) { -; GFX9-SDAG-LABEL: v_mul_f16_select_n128_n16: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc -; GFX9-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xd800 -; GFX9-GISEL-NEXT: 
v_mov_b32_e32 v3, 0xcc00 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo -; GFX10-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n16: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0xcc00 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo -; GFX11-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_mul_f16_select_n128_n16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc +; GFX9-NEXT: v_ldexp_f16_e64 v0, -v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0xcc00 -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_mul_f16_select_n128_n16: +; GFX1011: ; %bb.0: +; 
GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1011-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo +; GFX1011-NEXT: v_ldexp_f16_e64 v0, -v1, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, half -128.0, half -16.0 %mul = fmul half %x, %select.pow2 @@ -5157,11 +4920,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) { ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_64_1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_64_1: @@ -5176,10 +4938,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) { ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_64_1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_64_1: @@ -5194,10 +4956,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) { ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_64_1: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select contract i1 %cond, half 64.0, half 1.0 @@ -5220,11 +4982,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) { ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_1_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_1_64: @@ -5239,10 +5000,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) { ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_1_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_1_64: @@ -5257,10 +5018,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 
%arg, half %x, half %y) { ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_1_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select contract i1 %cond, half 1.0, half 64.0 @@ -5283,11 +5044,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) { ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0 +; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1: @@ -5302,10 +5062,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) { ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xbc00 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd400, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0 +; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1: @@ -5320,10 +5080,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) { ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xbc00 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd400, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0 +; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select contract i1 %cond, half -64.0, half -1.0 @@ -5346,11 +5106,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) { ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xbc00 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0 +; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64: @@ -5365,10 +5124,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) { ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; 
GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0 +; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64: @@ -5383,10 +5142,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) { ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0 +; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select contract i1 %cond, half -1.0, half -64.0 @@ -5409,11 +5168,14 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) { ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_128_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5800 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v3, v4 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_128_64: @@ -5428,10 +5190,13 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) { ; 
GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_128_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_64: @@ -5446,10 +5211,13 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) { ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, half 128.0, half 64.0 @@ -5472,11 +5240,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) { ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_128_4: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5800 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_128_4: @@ -5491,10 +5258,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) { ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_128_4: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x4400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_4: @@ -5509,10 +5276,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) { ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_4: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, half 128.0, half 4.0 @@ -5535,11 +5302,14 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) { ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_2_4: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v3, v4 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_2_4: @@ -5554,10 +5324,13 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) { ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_2_4: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x4400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x4000, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_2_4: @@ -5572,10 +5345,13 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) { ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_2_4: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x4000, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 
v3, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, half 2.0, half 4.0 @@ -5598,11 +5374,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) { ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_4_128: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x4400 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5800 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_4_128: @@ -5617,10 +5392,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) { ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_4_128: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x5800 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x4400, vcc_lo -; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_4_128: @@ -5635,10 +5410,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) { ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_4_128: ; GFX11-GISEL: ; %bb.0: ; 
GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x5800 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x4400, vcc_lo -; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0 +; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq i32 %arg, 0 %select.pow2 = select i1 %cond, half 4.0, half 128.0 @@ -5664,15 +5439,13 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) { ; GFX9-GISEL-LABEL: v_mul_v2f16_select_64_1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_v2f16_select_64_1: @@ -5690,14 +5463,14 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) { ; GFX10-GISEL-LABEL: v_mul_v2f16_select_64_1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo ; 
GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x5400, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_v2f16_select_64_1: @@ -5715,14 +5488,15 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) { ; GFX11-GISEL-LABEL: v_mul_v2f16_select_64_1: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x5400, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0> @@ -5747,15 +5521,13 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) { ; GFX9-GISEL-LABEL: v_mul_v2f16_select_1_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400 ; GFX9-GISEL-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_v2f16_select_1_64: @@ -5773,14 +5545,14 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) { ; GFX10-GISEL-LABEL: v_mul_v2f16_select_1_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x3c00, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_v2f16_select_1_64: @@ -5798,14 +5570,15 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-GISEL-LABEL: v_mul_v2f16_select_1_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x3c00, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0> @@ -5830,15 +5603,14 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) { ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n1_n64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xbc00 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n1_n64: @@ -5856,14 +5628,15 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) { ; GFX10-GISEL-LABEL: 
v_mul_v2f16_select_n1_n64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo +; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xbc00, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n1_n64: @@ -5881,14 +5654,16 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n1_n64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo +; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xbc00, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer 
%select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0> @@ -5913,15 +5688,19 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) { ; GFX9-GISEL-LABEL: v_mul_v2f16_select_128_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5800 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 6, v1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v3, v4 +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v3, v4 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_v2f16_select_128_64: @@ -5939,14 +5718,19 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) { ; GFX10-GISEL-LABEL: v_mul_v2f16_select_128_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; 
GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 6, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x5800, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_v2f16_select_128_64: @@ -5964,14 +5748,20 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-GISEL-LABEL: v_mul_v2f16_select_128_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 6, v1 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x5800, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x 
half> <half 64.0, half 64.0> @@ -5996,15 +5786,20 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) { ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n128_n64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xd800 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 6, v1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff +; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v3, v4 +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v3, v4 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n128_n64: @@ -6022,14 +5817,20 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) { ; GFX10-GISEL-LABEL: v_mul_v2f16_select_n128_n64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; 
GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 6, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n64: @@ -6047,14 +5848,21 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) { ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo +; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 6, v1 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x 
i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -64.0, half -64.0> @@ -6079,15 +5887,14 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) { ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n128_n16: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xd800 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xcc00 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 4, 7, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n128_n16: @@ -6105,14 +5912,15 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) { ; GFX10-GISEL-LABEL: v_mul_v2f16_select_n128_n16: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xcc00 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo +; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 4, 7, vcc_lo ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo +; 
GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n16: @@ -6130,14 +5938,16 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) { ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n16: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xcc00 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo +; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 4, 7, vcc_lo ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -16.0, half -16.0> @@ -6162,15 +5972,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x3c00 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1: @@ -6188,14 +5997,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5400, vcc_lo +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5400, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1: @@ -6213,14 +6023,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; 
GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5400, vcc_lo +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5400, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0> @@ -6246,15 +6058,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x3c00 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x5400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64: @@ -6272,14 +6083,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> 
%arg, <2 x hal ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x3c00, vcc_lo +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x3c00, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64: @@ -6297,14 +6109,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x3c00, vcc_lo +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x3c00, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-GISEL-NEXT: 
s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0> @@ -6330,15 +6144,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xbc00 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1: @@ -6356,14 +6170,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0xd400, vcc_lo +; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo ; 
GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0xd400, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1: @@ -6381,14 +6197,17 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0xd400, vcc_lo +; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0xd400, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half -64.0, half -64.0>, <2 x half> <half -1.0, half -1.0> @@ -6414,15 +6233,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xbc00 -; 
GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xd400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64: @@ -6440,14 +6259,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0xbc00, vcc_lo +; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0xbc00, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: 
v_contract_mul_add_v2f16_select_n1_n64: @@ -6465,14 +6286,17 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0xbc00, vcc_lo +; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0xbc00, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0> @@ -6498,15 +6322,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5800 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x5400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0 
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 6, v1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v4, v5 +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v4, v5 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64: @@ -6524,14 +6353,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v4 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 6, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v4 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64: @@ -6549,14 +6384,20 
@@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 6, v1 +; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v4 +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v4 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v5, v1 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 64.0, half 64.0> @@ -6582,15 +6423,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5800 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x4400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-GISEL-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 2, 7, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4: @@ -6608,14 +6448,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 2, 7, vcc_lo ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4: @@ -6633,14 +6474,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo +; 
GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 2, 7, vcc_lo ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 4.0, half 4.0> @@ -6666,15 +6509,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x4400 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 2, v1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v4, v5 +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v4, v5 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX9-GISEL-NEXT: 
v_pk_add_f16 v0, v0, v3 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4: @@ -6692,14 +6540,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x4000, vcc_lo +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v4 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 2, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v4 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x4000, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4: @@ -6717,14 +6571,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x4000, vcc_lo +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-GISEL-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 2, v1 +; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v4 +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v4 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v5, v1 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x4000, vcc_lo ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half 2.0, half 2.0>, <2 x half> <half 4.0, half 4.0> @@ -6750,15 +6610,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x5800 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 7, 2, vcc +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128: @@ -6776,14 +6635,15 @@ define <2 x 
half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x5800 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x4400, vcc_lo +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 7, 2, vcc_lo ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x4400, vcc_lo +; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128: @@ -6801,14 +6661,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x5800 ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x4400, vcc_lo +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo ; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 7, 2, vcc_lo ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x4400, vcc_lo +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3 +; 
GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %cond = icmp eq <2 x i32> %arg, zeroinitializer %select.pow2 = select <2 x i1> %cond, <2 x half> <half 4.0, half 4.0>, <2 x half> <half 128.0, half 128.0> diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index f6ee007facd7..80b4d64b1236 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -14,6 +14,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s ; Test patterns to match v_fract_* instructions. @@ -103,6 +104,21 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) nocapture writeonly ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX11-NEXT: global_store_b32 v[1:2], v4, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v3, v0 +; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX12-NEXT: v_floor_f32_e32 v4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo +; GFX12-NEXT: global_store_b32 v[1:2], v4, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -181,6 +197,18 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) nocaptu ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v[1:2], v3, off ; GFX11-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f32_noinf_check: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v3, v0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: global_store_b32 v[1:2], v3, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -263,6 +291,22 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w ; GFX11-NEXT: v_min_f32_e32 v4, 0x3f7fffff, v4 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: no_nan_check_math_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v3, v0 +; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v4, v0, v3 +; GFX12-NEXT: global_store_b32 v[1:2], v3, off +; GFX12-NEXT: v_min_num_f32_e32 v4, 0x3f7fffff, v4 +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -314,6 +358,16 @@ define float @basic_fract_f32_nonans(float nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_nonans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 
+; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -362,6 +416,19 @@ define float @basic_fract_f32_flags_minnum(float %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_flags_minnum: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -409,6 +476,16 @@ define float @basic_fract_f32_flags_fsub(float nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_flags_fsub: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub nsz float %x, %floor @@ -467,6 +544,17 @@ define <2 x float> @basic_fract_v2f32_nonans(<2 x float> nofpclass(nan) %x) { ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: v_fract_f32_e32 v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_v2f32_nonans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: v_fract_f32_e32 v1, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x) %sub = fsub <2 x float> %x, %floor @@ -540,6 +628,20 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v[1:2], v3, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f32_multi_use_fsub_nonans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v3, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v3, v0, v3 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: global_store_b32 v[1:2], v3, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -588,6 +690,16 @@ define float @nnan_minnum_fract_f32(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_minnum_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -638,6 +750,19 @@ define float @nnan_fsub_fract_f32(float %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_fsub_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub nnan float %x, %floor @@ -686,6 +811,19 @@ define float @nnan_floor_fract_f32(float %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_floor_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call nnan float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -733,6 +871,16 @@ define float @nnan_src_fract_f32(float nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: nnan_src_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -782,6 +930,19 @@ define float @not_fract_f32_wrong_const(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7ffffe, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX12-LABEL: not_fract_f32_wrong_const: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7ffffe, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -831,6 +992,19 @@ define float @not_fract_f32_swapped_fsub(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v1, v0 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_swapped_fsub: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %floor, %x @@ -880,6 +1054,19 @@ define float @not_fract_f32_not_floor(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_not_floor: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_trunc_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, 
v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.trunc.f32(float %x) %sub = fsub float %x, %floor @@ -929,6 +1116,19 @@ define float @not_fract_f32_different_floor(float %x, float %y) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_different_floor: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %y) %sub = fsub float %x, %floor @@ -978,6 +1178,19 @@ define float @not_fract_f32_maxnum(float nofpclass(nan) %x) { ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_max_f32_e32 v0, 0x3f7fffff, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: not_fract_f32_maxnum: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_max_num_f32_e32 v0, 0x3f7fffff, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1000,6 +1213,15 @@ define float @fcmp_uno_check_is_nan_f32(float %x) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: fcmp_uno_check_is_nan_f32: +; 
GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1054,6 +1276,16 @@ define float @select_nan_fract_f32(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: select_nan_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1107,6 +1339,16 @@ define float @commuted_select_nan_fract_f32(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: commuted_select_nan_fract_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1168,6 +1410,22 @@ define float @wrong_commuted_nan_select_f32(float %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: wrong_commuted_nan_select_f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 
0x0 +; GFX12-NEXT: v_floor_f32_e32 v1, v0 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v1, v0, v1 +; GFX12-NEXT: v_min_num_f32_e32 v1, 0x3f7fffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1231,6 +1489,16 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f16_nonan: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call half @llvm.floor.f16(half %x) %sub = fsub half %x, %floor @@ -1313,6 +1581,20 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) { ; GFX11-NEXT: v_fract_f16_e32 v1, v1 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_v2f16_nonan: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_fract_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_fract_f16_e32 v1, v1 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x) %sub = fsub <2 x half> %x, %floor @@ -1369,6 +1651,16 @@ define double 
@basic_fract_f64_nanans(double nofpclass(nan) %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: basic_fract_f64_nanans: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) %sub = fsub double %x, %floor @@ -1461,6 +1753,18 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) nocapture ; GFX11-NEXT: v_fract_f16_e32 v0, v0 ; GFX11-NEXT: global_store_b16 v[1:2], v3, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f16_noinf_check: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f16_e32 v3, v0 +; GFX12-NEXT: v_fract_f16_e32 v0, v0 +; GFX12-NEXT: global_store_b16 v[1:2], v3, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call half @llvm.floor.f16(half %x) %sub = fsub half %x, %floor @@ -1546,6 +1850,18 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) nocap ; GFX11-NEXT: v_fract_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f64_noinf_check: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_floor_f64_e32 v[4:5], v[0:1] +; GFX12-NEXT: v_fract_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: 
%floor = tail call double @llvm.floor.f64(double %x) %sub = fsub double %x, %floor @@ -1600,6 +1916,16 @@ define float @select_nan_fract_f32_flags_select(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: select_nan_fract_f32_flags_select: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1653,6 +1979,16 @@ define float @select_nan_fract_f32_flags_minnum(float %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: select_nan_fract_f32_flags_minnum: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call float @llvm.floor.f32(float %x) %sub = fsub float %x, %floor @@ -1769,6 +2105,25 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off ; GFX11-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_v2f32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f32_e32 v6, v0 +; GFX12-NEXT: v_cmp_class_f32_e64 s0, v0, 0x204 +; GFX12-NEXT: v_fract_f32_e32 v7, v1 +; GFX12-NEXT: v_floor_f32_e32 v4, v0 +; GFX12-NEXT: 
v_floor_f32_e32 v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 +; GFX12-NEXT: v_cmp_class_f32_e64 s0, v1, 0x204 +; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off +; GFX12-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x) %sub = fsub <2 x float> %x, %floor @@ -1881,6 +2236,21 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon ; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 ; GFX11-NEXT: global_store_b64 v[2:3], v[6:7], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f64_e32 v[4:5], v[0:1] +; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 +; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) %sub = fsub double %x, %floor @@ -2002,6 +2372,21 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX11-NEXT: global_store_b16 v[1:2], v4, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f16_e32 v3, v0 +; GFX12-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0| +; GFX12-NEXT: v_floor_f16_e32 v4, v0 +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo +; GFX12-NEXT: global_store_b16 v[1:2], v4, off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call half @llvm.floor.f16(half %x) %sub = fsub half %x, %floor @@ -2168,6 +2553,29 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX11-NEXT: global_store_b32 v[1:2], v4, off ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: v_fract_f16_e32 v6, v0 +; GFX12-NEXT: v_floor_f16_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_fract_f16_e32 v4, v3 +; GFX12-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204 +; GFX12-NEXT: v_floor_f16_e32 v7, v3 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 +; GFX12-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pack_b32_f16 v4, v5, v7 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 +; GFX12-NEXT: global_store_b32 v[1:2], v4, off +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x) %sub = fsub <2 x half> %x, %floor @@ -2311,6 +2719,26 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc ; GFX11-NEXT: v_cndmask_b32_e64 v3, v13, 0, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: safe_math_fract_v2f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 
0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_fract_f64_e32 v[10:11], v[0:1] +; GFX12-NEXT: v_cmp_class_f64_e64 s0, v[0:1], 0x204 +; GFX12-NEXT: v_fract_f64_e32 v[12:13], v[2:3] +; GFX12-NEXT: v_cmp_class_f64_e64 s1, v[2:3], 0x204 +; GFX12-NEXT: v_floor_f64_e32 v[8:9], v[2:3] +; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v0, v10, 0, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v1, v11, 0, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v12, 0, s1 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v13, 0, s1 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %x) %sub = fsub <2 x double> %x, %floor diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index b3001819e9aa..c1d5b5857b6b 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -2380,14 +2380,12 @@ define float @v_sqrt_f32_ulp2_contractable_rcp(float %x) { ; GISEL-IEEE: ; %bb.0: ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000 -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x4b800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: v_rsq_f32_e32 v0, v0 -; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x45800000 -; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_rcp: @@ -2734,20 +2732,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) { ; GISEL-IEEE: ; %bb.0: ; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000 -; GISEL-IEEE-NEXT: v_mov_b32_e32 v3, 0x4b800000 ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 24, vcc ; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v0, v4 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[4:5] +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v3 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, s[4:5] ; GISEL-IEEE-NEXT: v_rsq_f32_e32 v0, v0 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2 +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GISEL-IEEE-NEXT: v_rsq_f32_e32 v1, v1 -; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0x45800000 -; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc -; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5] -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, s[4:5] +; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp: diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir index cefd24032871..85c657789339 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir @@ -18,7 +18,7 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0 ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec - ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = 
V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec %0 = IMPLICIT_DEF %1 = COPY %0.sub1 %2 = COPY %0.sub0 @@ -43,7 +43,7 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0 ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec - ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec %0 = IMPLICIT_DEF %1 = COPY %0.sub1 %2 = COPY %0.sub0 @@ -68,7 +68,7 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1 ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec - ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec %0 = IMPLICIT_DEF %1 = COPY %0.sub0 %2 = COPY %0.sub1 @@ -90,7 +90,7 @@ body: | ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec - ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, 16384, 0, killed [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, 16384, 0, killed [[COPY]], 0, 
[[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 %0:vgpr_32 = COPY killed $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 157f91ccc6b1..b2f113f08a91 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -668,37 +668,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1) define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_add_i32 s3, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_i32 s3, s3, 1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] -; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_add_i32 s3, s3, 1 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX12-SDAG: ; %bb.0: -; 
GFX12-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX12-SDAG-NEXT: s_mov_b32 s1, 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] -; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, 1 +; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog @@ -934,37 +929,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace( define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_add_i32 s3, s3, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_i32 s3, s3, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] -; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_add_i32 s3, s3, -1 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; 
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] -; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, -1 +; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir new file mode 100644 index 000000000000..503f27edf70d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir @@ -0,0 +1,204 @@ +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -simplify-mir -start-before=greedy,2 -stress-regalloc=4 -stop-before=virtregrewriter,2 -o - -verify-regalloc %s 2> %t.err | FileCheck %s +# RUN: FileCheck -check-prefix=ERR %s < %t.err + +# To allocate the vreg_512_align2, the allocation will attempt to +# inflate the register class to av_512_align2. This will ultimately +# not work, and the allocation will fail. There is an unproductive +# live range split, and we end up with a snippet copy of an +# unspillable register. Recursive assignment of interfering ranges +# during last chance recoloring would delete the unspillable snippet +# live range. Make sure there's no use after free when rolling back +# the last chance assignment. 
+ +# ERR: error: <unknown>:0:0: ran out of registers during register allocation in function 'inflated_reg_class_copy_use_after_free' +# ERR: error: <unknown>:0:0: ran out of registers during register allocation in function 'inflated_reg_class_copy_use_after_free_lane_subset' + +--- | + define amdgpu_kernel void @inflated_reg_class_copy_use_after_free() { + ret void + } + + define amdgpu_kernel void @inflated_reg_class_copy_use_after_free_lane_subset() { + ret void + } + +... + +# CHECK-LABEL: name: inflated_reg_class_copy_use_after_free +# CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3 +# CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) +# CHECK-NEXT: [[RESTORE0:%[0-9]+]]:vreg_512_align2 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) +# CHECK-NEXT: early-clobber [[MFMA0:%[0-9]+]]:vreg_512_align2 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[RESTORE0]], 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec +# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[MFMA0]].sub2_sub3 { +# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY [[MFMA0]].sub0 +# CHECK-NEXT: } +# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 { +# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0 +# CHECK-NEXT: } +# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 { +# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0 +# CHECK-NEXT: } +# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5) +# CHECK-NEXT: [[RESTORE1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, 
addrspace 5) +# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE1]].sub0_sub1 +# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5) +# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE2]].sub2_sub3 { +# CHECK-NEXT: internal [[SPLIT3]].sub0:av_512_align2 = COPY [[RESTORE2]].sub0 +# CHECK-NEXT: } +# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT3]].sub2_sub3 { +# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[SPLIT3]].sub0 +# CHECK-NEXT: } +# CHECK-NEXT: [[SPLIT5:%[0-9]+]].sub2:av_512_align2 = COPY [[SPLIT4]].sub3 +# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT5]].sub0_sub1_sub2 +# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2 +# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0 { +# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT4]].sub2 +# CHECK-NEXT: } +# CHECK-NEXT: [[SPLIT9:%[0-9]+]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2 +# CHECK-NEXT: undef [[SPLIT10:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3 +# CHECK-NEXT: undef [[SPLIT13:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT10]].sub0_sub1_sub2_sub3 +# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0 +# CHECK-NEXT: [[MFMA_USE1]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[MFMA_USE1]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[MFMA_USE1]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[MFMA_USE1]].sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[MFMA_USE1]].sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[MFMA_USE1]].sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit 
$exec +# CHECK-NEXT: [[MFMA_USE1]].sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[MFMA_USE1]].sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[MFMA_USE1]].sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[MFMA_USE1]].sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[MFMA_USE1]].sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[MFMA_USE1]]:vreg_512_align2 = V_MFMA_F32_16X16X1F32_mac_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[MFMA_USE1]], 0, 0, 0, implicit $mode, implicit $exec + +--- +name: inflated_reg_class_copy_use_after_free +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr72_sgpr73_sgpr74_sgpr75' + stackPtrOffsetReg: '$sgpr32' + occupancy: 7 + vgprForAGPRCopy: '$vgpr255' + sgprForEXECCopy: '$sgpr74_sgpr75' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + + %0:vgpr_32 = IMPLICIT_DEF + renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed undef renamable $sgpr4_sgpr5, 0, 0 :: (load (s64), addrspace 4) + S_NOP 0, implicit-def undef %1.sub12_sub13_sub14_sub15:vreg_512_align2 + S_NOP 0, implicit-def %1.sub8_sub9_sub10_sub11:vreg_512_align2 + S_NOP 0, implicit-def %1.sub4_sub5_sub6_sub7:vreg_512_align2 + S_NOP 0, implicit-def %1.sub0_sub1_sub2_sub3:vreg_512_align2 + early-clobber %2:vreg_512_align2 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, %1, 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec + %1.sub2:vreg_512_align2 = COPY %2.sub3 + %1.sub3:vreg_512_align2 = COPY %2.sub2 + %1.sub4:vreg_512_align2 = COPY %2.sub0 + %1.sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub10:vreg_512_align2 = V_MOV_B32_e32 
0, implicit $exec + %1.sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1:vreg_512_align2 = V_MFMA_F32_16X16X1F32_mac_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, %1, 0, 0, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR undef %3:vgpr_32, %1.sub12_sub13_sub14_sub15, undef renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 + +... + +# This test is similar to inflated_reg_class_copy_use_after_free, except it is still broken when the use +# instruction does not read the full set of lanes after one attempted fix. + +# CHECK-LABEL: name: inflated_reg_class_copy_use_after_free_lane_subset +# CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3 +# CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) +# CHECK-NEXT: [[RESTORE_0:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) +# CHECK-NEXT: S_NOP 0, implicit-def early-clobber [[REG1:%[0-9]+]], implicit [[RESTORE_0]].sub0_sub1_sub2_sub3, implicit [[RESTORE_0]].sub4_sub5_sub6_sub7 +# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[REG1]].sub2_sub3 { +# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY [[REG1]].sub0 +# CHECK-NEXT: } +# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 { +# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0 +# CHECK-NEXT: } +# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 { +# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0 +# CHECK-NEXT: } +# CHECK-NEXT: SI_SPILL_AV512_SAVE
[[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5) +# CHECK-NEXT: [[RESTORE_1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) +# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE_1]].sub0_sub1 +# CHECK-NEXT: [[RESTORE_2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5) +# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE_2]].sub2_sub3 { +# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[RESTORE_2]].sub0 +# CHECK-NEXT: } +# CHECK-NEXT: undef [[SPLIT5:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT4]].sub2_sub3 { +# CHECK-NEXT: internal [[SPLIT5]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0 +# CHECK-NEXT: } +# CHECK-NEXT: [[SPLIT3]].sub2:av_512_align2 = COPY [[SPLIT5]].sub3 +# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT3]].sub0_sub1_sub2 +# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2 +# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT5]].sub0 { +# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT5]].sub2 +# CHECK-NEXT: } +# CHECK-NEXT: [[SPLIT7]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2 +# CHECK-NEXT: undef [[SPLIT9:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT7]].sub0_sub1_sub2_sub3 +# CHECK-NEXT: undef [[LAST_USE:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3 +# CHECK-NEXT: [[LAST_USE]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0 +# CHECK-NEXT: [[LAST_USE]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[LAST_USE]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[LAST_USE]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: 
[[LAST_USE]].sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[LAST_USE]].sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[LAST_USE]].sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[LAST_USE]].sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[LAST_USE]].sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[LAST_USE]].sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[LAST_USE]].sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: [[LAST_USE]].sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec +# CHECK-NEXT: S_NOP 0, implicit-def [[LAST_USE]], implicit [[LAST_USE]].sub0_sub1_sub2_sub3, implicit [[LAST_USE]].sub4_sub5_sub6_sub7, implicit [[LAST_USE]].sub8_sub9_sub10_sub11 + +--- +name: inflated_reg_class_copy_use_after_free_lane_subset +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr72_sgpr73_sgpr74_sgpr75' + stackPtrOffsetReg: '$sgpr32' + occupancy: 7 + vgprForAGPRCopy: '$vgpr255' + sgprForEXECCopy: '$sgpr74_sgpr75' +body: | + bb.0: + liveins: $vgpr0, $sgpr4_sgpr5 + + %0:vgpr_32 = IMPLICIT_DEF + renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed undef renamable $sgpr4_sgpr5, 0, 0 :: (load (s64), addrspace 4) + S_NOP 0, implicit-def undef %1.sub12_sub13_sub14_sub15:vreg_512_align2 + S_NOP 0, implicit-def %1.sub8_sub9_sub10_sub11:vreg_512_align2 + S_NOP 0, implicit-def %1.sub4_sub5_sub6_sub7:vreg_512_align2 + S_NOP 0, implicit-def %1.sub0_sub1_sub2_sub3:vreg_512_align2 + S_NOP 0, implicit-def early-clobber %2:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7 + %1.sub2:vreg_512_align2 = COPY %2.sub3 + %1.sub3:vreg_512_align2 = COPY %2.sub2 + %1.sub4:vreg_512_align2 = COPY %2.sub0 + %1.sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub7:vreg_512_align2 = 
V_MOV_B32_e32 0, implicit $exec + %1.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + %1.sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec + S_NOP 0, implicit-def %1:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7, implicit %1.sub8_sub9_sub10_sub11 + GLOBAL_STORE_DWORDX4_SADDR undef %3:vgpr_32, %1.sub12_sub13_sub14_sub15, undef renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index cf9fdbdc3439..2ceaca3497ec 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %12 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %12 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %10 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 
7471114 /* regdef:SGPR_128 */, def %10 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll b/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll new file mode 100644 index 000000000000..1a87887e28d7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s + +define <4 x float> @issue121601(bfloat %fptrunc) { +; CHECK-LABEL: issue121601: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb: + %bitcast = bitcast bfloat %fptrunc to <1 x bfloat> + %shufflevector = shufflevector <1 x bfloat> %bitcast, <1 x bfloat> zeroinitializer, <2 x i32> zeroinitializer + %fpext = fpext <2 x bfloat> %shufflevector to <2 x float> + %shufflevector1 = shufflevector <2 x float> %fpext, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x float> %shufflevector1 +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 3ff759a5cdb9..867025adca94 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -4,6 +4,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn 
-mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_f16: @@ -80,6 +81,19 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_cos_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: cos_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cos_f16_e32 v1, v1 +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load half, ptr addrspace(1) %a %r.val = call half @llvm.cos.f16(half %a.val) store half %r.val, ptr addrspace(1) %r @@ -188,6 +202,24 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: cos_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX12-NEXT: v_cos_f16_e32 v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-NEXT: v_cos_f16_e32 v2, v2 +; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load <2 x half>, ptr addrspace(1) %a %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val) store <2 x half> %r.val, ptr addrspace(1) %r diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index ac515808a0d8..333d428c84bc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -41,10 +41,10 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SI-GISEL-NEXT: s_mov_b32 s2, -1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -78,9 +78,9 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v2, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: flat_store_dword 
v[0:1], v2 @@ -115,9 +115,9 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -203,7 +203,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; SI-GISEL-NEXT: v_not_b32_e32 v2, 63 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 @@ -213,10 +213,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-GISEL-NEXT: v_add_f32_e32 v0, s7, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v3, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v3, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v3, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 ; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -252,7 +252,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; 
VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; VI-GISEL-NEXT: v_not_b32_e32 v2, 63 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 @@ -262,10 +262,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-GISEL-NEXT: v_add_f32_e32 v0, s7, v0 ; VI-GISEL-NEXT: v_exp_f32_e32 v3, v3 ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v3, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] +; VI-GISEL-NEXT: v_ldexp_f32 v0, v3, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v2 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -300,7 +300,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 +; GFX900-GISEL-NEXT: v_not_b32_e32 v2, 63 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 @@ -310,10 +310,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s11, v0 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v3, v0 -; 
GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v3, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm @@ -421,17 +421,17 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; SI-GISEL-NEXT: v_not_b32_e32 v3, 63 ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; SI-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v4 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc ; SI-GISEL-NEXT: v_add_f32_e32 v4, s1, v4 ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 @@ -439,11 +439,11 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-GISEL-NEXT: v_exp_f32_e32 v4, v4 ; SI-GISEL-NEXT: v_add_f32_e32 v1, s2, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v2, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, v4, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] ; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; 
SI-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 ; SI-GISEL-NEXT: s_endpgm @@ -487,16 +487,16 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; VI-GISEL-NEXT: v_not_b32_e32 v3, 63 ; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; VI-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc ; VI-GISEL-NEXT: v_add_f32_e32 v4, s1, v4 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 @@ -504,10 +504,10 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: v_add_f32_e32 v1, s2, v1 ; VI-GISEL-NEXT: v_exp_f32_e32 v4, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v2, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] +; VI-GISEL-NEXT: v_ldexp_f32 v1, v4, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] @@ -551,15 +551,15 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) 
; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 +; GFX900-GISEL-NEXT: v_not_b32_e32 v3, 63 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc ; GFX900-GISEL-NEXT: v_add_f32_e32 v4, s1, v4 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 @@ -567,10 +567,10 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s2, v1 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v4, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] +; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v4, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm @@ -710,7 +710,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 +; SI-GISEL-NEXT: v_not_b32_e32 v4, 63 
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc @@ -720,22 +720,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_add_f32_e32 v1, s9, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] ; SI-GISEL-NEXT: v_add_f32_e32 v5, s10, v5 ; SI-GISEL-NEXT: v_add_f32_e32 v2, s11, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v5, v5 ; SI-GISEL-NEXT: v_exp_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc -; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] +; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, v5, v2 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v3, v3, v4 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 ; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -787,7 +787,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 +; VI-GISEL-NEXT: v_not_b32_e32 v4, 63 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc @@ -797,22 +797,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v1, s9, v1 ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 +; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v5 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] ; VI-GISEL-NEXT: v_add_f32_e32 v5, s10, v5 ; VI-GISEL-NEXT: v_add_f32_e32 v2, s11, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5 ; VI-GISEL-NEXT: v_exp_f32_e32 v3, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] +; VI-GISEL-NEXT: v_ldexp_f32 v2, v5, v2 +; VI-GISEL-NEXT: v_ldexp_f32 v3, v3, v4 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -863,7 +863,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 +; GFX900-GISEL-NEXT: v_not_b32_e32 v4, 63 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 
vcc, s8, v2 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc @@ -873,22 +873,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v1, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] ; GFX900-GISEL-NEXT: v_add_f32_e32 v5, s10, v5 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, s11, v2 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v5 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] +; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v5, v2 +; GFX900-GISEL-NEXT: v_ldexp_f32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm @@ -1006,19 +1006,19 @@ define float @v_exp2_f32(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 
0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32: ; VI-SDAG: ; %bb.0: @@ -1034,6 +1034,20 @@ define float @v_exp2_f32(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1048,6 +1062,20 @@ define float @v_exp2_f32(float %in) { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32: +; 
GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1076,19 +1104,19 @@ define float @v_exp2_fabs_f32(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_fabs_f32: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_fabs_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_fabs_f32: ; VI-SDAG: ; %bb.0: @@ 
-1104,6 +1132,20 @@ define float @v_exp2_fabs_f32(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_fabs_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_fabs_f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1118,6 +1160,20 @@ define float @v_exp2_fabs_f32(float %in) { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_fabs_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_fabs_f32: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1147,19 +1203,19 @@ define float @v_exp2_fneg_fabs_f32(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_fneg_fabs_f32: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; 
GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_sub_f32_e64 v0, v1, |v0| -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_fneg_fabs_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_sub_f32_e64 v0, v1, |v0| +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_fneg_fabs_f32: ; VI-SDAG: ; %bb.0: @@ -1175,6 +1231,20 @@ define float @v_exp2_fneg_fabs_f32(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_fneg_fabs_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_sub_f32_e64 v0, v1, |v0| +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1189,6 +1259,20 @@ define float @v_exp2_fneg_fabs_f32(float %in) { ; GFX900-SDAG-NEXT: 
v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e64 v0, v1, |v0| +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_fneg_fabs_f32: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1219,19 +1303,19 @@ define float @v_exp2_fneg_f32(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_fneg_f32: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_fneg_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: 
v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_fneg_f32: ; VI-SDAG: ; %bb.0: @@ -1247,6 +1331,20 @@ define float @v_exp2_fneg_f32(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_fneg_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_fneg_f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1261,6 +1359,20 @@ define float @v_exp2_fneg_f32(float %in) { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_fneg_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_fneg_f32: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1290,19 +1402,19 @@ define float @v_exp2_f32_fast(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_fast: -; GCN-GISEL: ; %bb.0: -; 
GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_fast: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_fast: ; VI-SDAG: ; %bb.0: @@ -1318,6 +1430,20 @@ define float @v_exp2_f32_fast(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_fast: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_fast: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1332,6 +1458,20 @@ 
define float @v_exp2_f32_fast(float %in) { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_fast: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_fast: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1360,19 +1500,19 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_unsafe_math_attr: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_unsafe_math_attr: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; 
SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_unsafe_math_attr: ; VI-SDAG: ; %bb.0: @@ -1388,6 +1528,20 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_unsafe_math_attr: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_unsafe_math_attr: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1402,6 +1556,20 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_unsafe_math_attr: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_unsafe_math_attr: ; R600: ; 
%bb.0: ; R600-NEXT: CF_END @@ -1430,19 +1598,19 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_approx_fn_attr: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_approx_fn_attr: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_approx_fn_attr: ; VI-SDAG: ; %bb.0: @@ -1458,6 +1626,20 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_approx_fn_attr: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; 
VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_approx_fn_attr: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1472,6 +1654,20 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_approx_fn_attr: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_approx_fn_attr: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1500,19 +1696,19 @@ define float @v_exp2_f32_ninf(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_ninf: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: 
v_exp2_f32_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_ninf: ; VI-SDAG: ; %bb.0: @@ -1528,6 +1724,20 @@ define float @v_exp2_f32_ninf(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_ninf: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_ninf: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1542,6 +1752,20 @@ define float @v_exp2_f32_ninf(float %in) { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_ninf: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: 
v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_ninf: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1570,19 +1794,19 @@ define float @v_exp2_f32_afn(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_afn: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_afn: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_afn: ; VI-SDAG: ; %bb.0: @@ -1598,6 +1822,20 @@ define float @v_exp2_f32_afn(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: 
v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_afn: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1612,6 +1850,20 @@ define float @v_exp2_f32_afn(float %in) { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_afn: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_afn: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1660,19 +1912,19 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_afn_dynamic: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_afn_dynamic: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_afn_dynamic: ; VI-SDAG: ; %bb.0: @@ -1688,6 +1940,20 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_afn_dynamic: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_afn_dynamic: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1702,6 +1968,20 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_afn_dynamic: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: 
v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_afn_dynamic: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1730,19 +2010,19 @@ define float @v_fabs_exp2_f32_afn(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_fabs_exp2_f32_afn: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_fabs_exp2_f32_afn: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_fabs_exp2_f32_afn: ; VI-SDAG: ; %bb.0: @@ -1758,6 +2038,20 @@ define float @v_fabs_exp2_f32_afn(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_fabs_exp2_f32_afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_fabs_exp2_f32_afn: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1772,6 +2066,20 @@ define float @v_fabs_exp2_f32_afn(float %in) { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_fabs_exp2_f32_afn: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_fabs_exp2_f32_afn: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1821,19 +2129,19 @@ define float @v_exp2_f32_nnan(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_nnan: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: 
v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_nnan: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_nnan: ; VI-SDAG: ; %bb.0: @@ -1849,6 +2157,20 @@ define float @v_exp2_f32_nnan(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_nnan: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_nnan: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1863,6 +2185,20 @@ define float @v_exp2_f32_nnan(float %in) { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_nnan: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; 
GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_nnan: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -1911,19 +2247,19 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_nnan_dynamic: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_nnan_dynamic: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_nnan_dynamic: ; VI-SDAG: ; %bb.0: @@ -1939,6 +2275,20 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 { ; VI-SDAG-NEXT: v_ldexp_f32 v0, 
v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_nnan_dynamic: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_nnan_dynamic: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1953,6 +2303,20 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_nnan_dynamic: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_nnan_dynamic: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -2001,19 +2365,19 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_ninf_dynamic: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; 
GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_ninf_dynamic: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_ninf_dynamic: ; VI-SDAG: ; %bb.0: @@ -2029,6 +2393,20 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_ninf_dynamic: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_ninf_dynamic: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2043,6 +2421,20 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: 
s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_ninf_dynamic: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_ninf_dynamic: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -2071,19 +2463,19 @@ define float @v_exp2_f32_nnan_ninf(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_nnan_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; 
SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_nnan_ninf: ; VI-SDAG: ; %bb.0: @@ -2099,6 +2491,20 @@ define float @v_exp2_f32_nnan_ninf(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_nnan_ninf: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_nnan_ninf: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2113,6 +2519,20 @@ define float @v_exp2_f32_nnan_ninf(float %in) { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_nnan_ninf: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_nnan_ninf: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -2161,19 +2581,19 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: 
v_exp2_f32_nnan_ninf_dynamic: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic: ; VI-SDAG: ; %bb.0: @@ -2189,6 +2609,20 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: 
v_exp2_f32_nnan_ninf_dynamic: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2203,6 +2637,20 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_nnan_ninf_dynamic: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -2251,19 +2699,19 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_dynamic_mode: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_dynamic_mode: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 
vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_dynamic_mode: ; VI-SDAG: ; %bb.0: @@ -2279,6 +2727,20 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_dynamic_mode: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_dynamic_mode: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2293,6 +2755,20 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_dynamic_mode: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; 
GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_dynamic_mode: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -2313,20 +2789,50 @@ define float @v_exp2_f32_undef() { ; GCN-SDAG-NEXT: v_exp_f32_e32 v0, 0x7fc00000 ; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_undef: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; GCN-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 -; GCN-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_undef: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; SI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 +; SI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_exp2_f32_undef: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; VI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: 
v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-GISEL-LABEL: v_exp2_f32_undef: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX900-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_undef: ; R600: ; %bb.0: @@ -3359,19 +3865,19 @@ define float @v_exp2_f32_contract(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_contract: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_contract: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 
63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_contract: ; VI-SDAG: ; %bb.0: @@ -3387,6 +3893,20 @@ define float @v_exp2_f32_contract(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_contract: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_contract: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3401,6 +3921,20 @@ define float @v_exp2_f32_contract(float %in) { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_contract: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_contract: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -3449,19 +3983,19 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) { ; SI-SDAG-NEXT: v_ldexp_f32_e32 
v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GCN-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf: ; VI-SDAG: ; %bb.0: @@ -3477,6 +4011,20 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) { ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; 
VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3491,6 +4039,20 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) { ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; R600-LABEL: v_exp2_f32_contract_nnan_ninf: ; R600: ; %bb.0: ; R600-NEXT: CF_END @@ -3518,3 +4080,5 @@ declare <3 x half> @llvm.exp2.v3f16(<3 x half>) #2 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" } attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN-GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index b9fef0834cb2..88ef7a936393 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -3,11 +3,13 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s define { half, i32 } @test_frexp_f16_i32(half %a) { ; GFX6-SDAG-LABEL: test_frexp_f16_i32: @@ -50,6 +52,19 @@ define { half, i32 } @test_frexp_f16_i32(half %a) { ; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v1, v0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; 
GFX6-GISEL-LABEL: test_frexp_f16_i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -96,6 +111,16 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) { ; GFX11-NEXT: v_frexp_mant_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -145,6 +170,18 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) { ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -221,6 +258,25 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX11-NEXT: v_bfe_i32 v2, v4, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f16_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_frexp_mant_f16_e32 v3, v1 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v4, v1 +; GFX12-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_pack_b32_f16 v0, v2, v3 +; GFX12-NEXT: v_bfe_i32 v2, v4, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -311,6 +367,20 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) { ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_frexp_mant_f16_e32 v1, v1 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -386,6 +456,22 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) { ; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: 
v_frexp_exp_i16_f16_e32 v1, v1 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -463,6 +549,19 @@ define { half, i16 } @test_frexp_f16_i16(half %a) { ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i16: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -509,6 +608,16 @@ define half @test_frexp_f16_i16_only_use_fract(half %a) { ; GFX11-NEXT: v_frexp_mant_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i16_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -554,6 +663,16 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) { ; GFX11-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f16_i16_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -623,6 +742,19 @@ define { float, i32 } @test_frexp_f32_i32(float %a) { ; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f32_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f32_i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -665,6 +797,16 @@ define float @test_frexp_f32_i32_only_use_fract(float %a) { ; GFX11-NEXT: v_frexp_mant_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f32_i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -706,6 +848,16 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) { ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f32_i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -771,6 +923,21 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) { ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f32_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v5, v1 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -846,6 +1013,17 @@ define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) { ; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f32_v2i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX12-NEXT: v_frexp_mant_f32_e32 v1, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -896,6 +1074,17 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) { ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: 
test_frexp_v2f32_v2i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -954,6 +1143,19 @@ define { double, i32 } @test_frexp_f64_i32(double %a) { ; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f64_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[3:4], v[0:1] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v2, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f64_i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1000,6 +1202,16 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) { ; GFX11-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f64_i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_fract: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1044,6 +1256,16 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) { 
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_f64_i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_exp: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1116,6 +1338,22 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) { ; GFX11-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: test_frexp_v2f64_v2i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[8:9], v[0:1] +; GFX12-NEXT: v_frexp_mant_f64_e32 v[6:7], v[2:3] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v4, v[0:1] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v5, v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 +; GFX12-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32: ; GFX6-GISEL: ; %bb.0: ; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1174,6 +1412,17 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { ; GFX11-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX11-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: 
s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] +; GFX12-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0 ret <2 x double> %result.0 @@ -1213,6 +1462,17 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) { ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_exp: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1] +; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) %result.1 = extractvalue { <2 x double>, <2 x i32> } %result, 1 ret <2 x i32> %result.1 @@ -1235,3 +1495,5 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo ; GCN: {{.*}} ; GFX11-GISEL: {{.*}} ; GFX11-SDAG: {{.*}} +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 218e41faa703..b850428a03c0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -45,16 +45,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: 
v_cmp_lt_f32_e32 vcc, s0, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0 ; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4 ; SI-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1 @@ -64,7 +65,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -104,25 +104,25 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 +; 
VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 +; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -162,25 +162,25 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 
v0, v1, s[0:1] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v2, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v3, v2 +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x41b17218 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -218,24 +218,26 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 -; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s2 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log_f32: @@ -358,35 +360,36 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-GISEL-NEXT: 
v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 -; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v2 -; SI-GISEL-NEXT: v_fma_f32 v7, v2, v3, -v6 -; SI-GISEL-NEXT: v_fma_f32 v7, v2, v4, v7 -; SI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v1 +; SI-GISEL-NEXT: v_fma_f32 v6, v1, v2, -v5 +; SI-GISEL-NEXT: v_fma_f32 v6, v1, v3, v6 +; SI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s7, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v5, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x41b17218 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 -; SI-GISEL-NEXT: v_fma_f32 v3, v1, v3, -v2 -; SI-GISEL-NEXT: v_fma_f32 v3, v1, v4, v3 -; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v5 +; SI-GISEL-NEXT: v_fma_f32 v2, v5, v2, -v1 +; SI-GISEL-NEXT: v_fma_f32 v2, v5, v3, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[0:1] ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 ; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 @@ -445,42 +448,43 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-GISEL: ; 
%bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 -; VI-GISEL-NEXT: v_log_f32_e32 v2, v2 -; VI-GISEL-NEXT: v_and_b32_e32 v4, 0xfffff000, v2 -; VI-GISEL-NEXT: v_sub_f32_e32 v5, v2, v4 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v1, s6, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v1, v1 +; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v1 +; VI-GISEL-NEXT: v_sub_f32_e32 v4, v1, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v5 -; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5 -; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v3 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 -; VI-GISEL-NEXT: v_log_f32_e32 v1, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s7, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v3, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; VI-GISEL-NEXT: 
v_sub_f32_e32 v0, v2, v0 -; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v5, v1, v2 +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0 +; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v3 +; VI-GISEL-NEXT: v_sub_f32_e32 v5, v3, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v5 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v7, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v5 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v3 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 +; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v5 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1] ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 @@ -531,37 +535,38 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf +; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s10, v2 -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v2 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v7, v2, v3, -v6 -; GFX900-GISEL-NEXT: v_fma_f32 v7, v2, v4, v7 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; 
GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s10, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v1 +; GFX900-GISEL-NEXT: v_fma_f32 v7, v1, v3, -v6 +; GFX900-GISEL-NEXT: v_fma_f32 v7, v1, v4, v7 ; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1] ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s11, v0 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x41b17218 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v3, v1, v3, -v2 -; GFX900-GISEL-NEXT: v_fma_f32 v3, v1, v4, v3 -; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v5 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[0:1] -; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s11, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v6 +; GFX900-GISEL-NEXT: v_fma_f32 v3, v6, v3, -v1 +; GFX900-GISEL-NEXT: v_fma_f32 v3, v6, v4, v3 +; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 
vcc, |v6|, v5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -608,31 +613,37 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s3, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, 0x3f317217, v0 :: v_dual_mul_f32 v3, 0x3f317217, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v3, 0x3f317217, v1 :: v_dual_lshlrev_b32 v0, 5, v0 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s2, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_fma_f32 v5, 0x3f317217, v1, -v3 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v5, 0x3377d1cf, v1 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3f317217, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s5 ; GFX1100-GISEL-NEXT: v_fma_f32 v4, 0x3f317217, v0, -v2 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_fma_f32 v5, 0x3f317217, v1, -v3 -; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v4, 0x3377d1cf, v0 +; GFX1100-GISEL-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s5 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_mov_b32 v2, 0 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm @@ -808,49 +819,51 @@ define amdgpu_kernel void @s_log_v3f32(ptr 
addrspace(1) %out, <3 x float> %in) { ; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf -; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 -; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v0 -; SI-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v6 -; SI-GISEL-NEXT: v_fma_f32 v7, v0, v4, v7 -; SI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0 +; SI-GISEL-NEXT: v_fma_f32 v6, v0, v2, -v5 +; SI-GISEL-NEXT: v_fma_f32 v6, v0, v3, v6 +; SI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v2, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v6, s9, v6 -; SI-GISEL-NEXT: v_log_f32_e32 v6, v6 -; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v5, s9, v5 +; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; 
SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x41b17218 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8 -; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v6 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1 -; SI-GISEL-NEXT: v_fma_f32 v9, v6, v3, -v8 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v1 -; SI-GISEL-NEXT: v_fma_f32 v9, v6, v4, v9 -; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v8, s[2:3] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v7, s[0:1] -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6 -; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v2 -; SI-GISEL-NEXT: v_fma_f32 v3, v2, v3, -v6 -; SI-GISEL-NEXT: v_fma_f32 v3, v2, v4, v3 -; SI-GISEL-NEXT: v_add_f32_e32 v3, v6, v3 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v7 +; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317217, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_fma_f32 v8, v5, v2, -v7 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_fma_f32 v8, v5, v3, v8 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s10, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 +; SI-GISEL-NEXT: v_log_f32_e32 v8, v1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1] +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v8 +; SI-GISEL-NEXT: v_fma_f32 v2, v8, v2, -v5 +; SI-GISEL-NEXT: v_fma_f32 v2, v8, v3, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v2, v5, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v8|, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; SI-GISEL-NEXT: 
v_cndmask_b32_e32 v3, 0, v6, vcc ; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 ; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 @@ -927,12 +940,13 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v3 @@ -943,45 +957,46 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s9, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; VI-GISEL-NEXT: v_ldexp_f32 v3, s9, v3 ; VI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41b17218 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v6 -; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v3 +; VI-GISEL-NEXT: 
v_mov_b32_e32 v4, 0x41b17218 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 +; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v3 +; VI-GISEL-NEXT: v_sub_f32_e32 v6, v3, v5 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v7, v3, v6 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317000, v7 -; VI-GISEL-NEXT: v_log_f32_e32 v2, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, v4 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, v6, s[2:3] -; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5 +; VI-GISEL-NEXT: v_ldexp_f32 v1, s10, v1 +; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-GISEL-NEXT: v_log_f32_e32 v6, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[2:3] +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v3 -; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 -; VI-GISEL-NEXT: v_sub_f32_e32 v6, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v6 +; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v6 +; VI-GISEL-NEXT: v_sub_f32_e32 v5, v6, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v5 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6 -; 
VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5 +; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v4 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v5 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v6|, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 @@ -1046,49 +1061,51 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s8, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v6 +; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v2, -v6 ; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v4, v7 ; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, 
v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v2, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s9, v6 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v6 +; GFX900-GISEL-NEXT: v_ldexp_f32 v6, s9, v6 ; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v6 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v6 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v3, -v8 -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v2, -v8 +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v4, v9 +; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s10, v1 ; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 +; GFX900-GISEL-NEXT: v_log_f32_e32 v9, v1 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v8, s[2:3] ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v7, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v3, v2, v3, -v6 -; GFX900-GISEL-NEXT: v_fma_f32 v3, v2, v4, v3 -; GFX900-GISEL-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v9 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v9, v2, -v6 +; GFX900-GISEL-NEXT: 
v_fma_f32 v2, v9, v4, v2 +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v9|, v5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1] +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -1156,49 +1173,55 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s6 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2 +; 
GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v2 :: v_dual_lshlrev_b32 v0, 5, v0 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_fma_f32 v8, 0x3f317217, v2, -v5 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v8, 0x3377d1cf, v2 +; GFX1100-GISEL-NEXT: v_add_f32_e32 v5, v5, v8 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317217, v1 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v6, 0x3f317217, v0, -v3 +; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v6, 0x3377d1cf, v0 :: v_dual_lshlrev_b32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v2 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v4, 
0x3f317217, v1 ; GFX1100-GISEL-NEXT: v_fma_f32 v7, 0x3f317217, v1, -v4 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v6, 0x3377d1cf, v0 -; GFX1100-GISEL-NEXT: v_fma_f32 v8, 0x3f317217, v2, -v5 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 -; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v8, 0x3377d1cf, v2 :: v_dual_mov_b32 v3, 0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v1, v1, v10 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_sub_f32 v0, v0, v9 +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v9 :: v_dual_sub_f32 v1, v1, v10 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6 @@ -1433,62 +1456,65 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 
0x3f317217 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x3377d1cf -; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; SI-GISEL-NEXT: v_fma_f32 v7, v0, v4, -v1 -; SI-GISEL-NEXT: v_fma_f32 v7, v0, v5, v7 -; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v6 +; SI-GISEL-NEXT: v_fma_f32 v6, v0, v3, -v1 +; SI-GISEL-NEXT: v_fma_f32 v6, v0, v4, v6 +; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v6 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s9, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8 -; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v1 -; SI-GISEL-NEXT: v_fma_f32 v9, v1, v4, -v8 -; SI-GISEL-NEXT: v_fma_f32 v9, v1, v5, v9 +; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x41b17218 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v7 +; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317217, v1 +; SI-GISEL-NEXT: v_fma_f32 v8, v1, v3, -v7 +; SI-GISEL-NEXT: v_fma_f32 v8, v1, v4, v8 ; 
SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v9, s10, v9 -; SI-GISEL-NEXT: v_log_f32_e32 v9, v9 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v6 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[2:3] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v7, s[0:1] +; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 5, v8 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v8, s10, v8 +; SI-GISEL-NEXT: v_log_f32_e32 v8, v8 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v8 -; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v9 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 -; SI-GISEL-NEXT: v_fma_f32 v10, v9, v4, -v8 -; SI-GISEL-NEXT: v_log_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v10, v9, v5, v10 -; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v10 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v9|, v6 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v8, s[2:3] -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 -; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v3 -; SI-GISEL-NEXT: v_fma_f32 v4, v3, v4, -v8 -; SI-GISEL-NEXT: v_fma_f32 v4, v3, v5, v4 -; SI-GISEL-NEXT: v_add_f32_e32 v4, v8, v4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v6 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v7 +; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317217, v8 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_fma_f32 v9, v8, v3, -v7 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-GISEL-NEXT: v_fma_f32 v9, v8, 
v4, v9 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, s11, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v9 +; SI-GISEL-NEXT: v_log_f32_e32 v9, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v8|, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v7, s[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v7 +; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317217, v9 +; SI-GISEL-NEXT: v_fma_f32 v3, v9, v3, -v7 +; SI-GISEL-NEXT: v_fma_f32 v3, v9, v4, v3 +; SI-GISEL-NEXT: v_add_f32_e32 v3, v7, v3 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v9|, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -1581,12 +1607,13 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 ; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v1 @@ -1597,62 +1624,64 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v1, s9, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41b17218 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v6 -; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v7, v1, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v6 -; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317000, v7 -; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41b17218 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 +; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v1 +; VI-GISEL-NEXT: v_sub_f32_e32 v6, v1, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v5 +; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 1.0, v3, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v7, s10, v7 -; VI-GISEL-NEXT: v_log_f32_e32 v7, v7 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v4 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v5, s[0:1] -; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6 -; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 
0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v6 +; VI-GISEL-NEXT: v_ldexp_f32 v6, s10, v6 +; VI-GISEL-NEXT: v_log_f32_e32 v6, v6 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 +; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v6 +; VI-GISEL-NEXT: v_sub_f32_e32 v7, v6, v5 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 -; VI-GISEL-NEXT: v_sub_f32_e32 v8, v7, v6 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v8 -; VI-GISEL-NEXT: v_mul_f32_e32 v10, 0x3805fdf4, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v9, v10, v9 -; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317000, v8 -; VI-GISEL-NEXT: v_log_f32_e32 v3, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 -; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6 -; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v7|, v4 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v6, s[2:3] -; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6 -; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v3 -; VI-GISEL-NEXT: v_sub_f32_e32 v7, v3, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317000, v7 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5 +; VI-GISEL-NEXT: v_ldexp_f32 v2, s11, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7 +; VI-GISEL-NEXT: v_log_f32_e32 v7, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v5, s[2:3] +; VI-GISEL-NEXT: v_cndmask_b32_e32 
v5, 0, v4, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5 +; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v7 +; VI-GISEL-NEXT: v_sub_f32_e32 v6, v7, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v5 +; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6 -; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v4 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] +; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5 +; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1730,61 +1759,64 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x3377d1cf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s8, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x3377d1cf ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 
0x3f317217, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v4, -v1 +; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v1 ; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v5, v7 ; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v7 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s9, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v4, -v8 +; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v3, -v8 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v5, v9 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v9, s10, v9 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v9, 5, v9 +; GFX900-GISEL-NEXT: v_ldexp_f32 v9, s10, v9 ; GFX900-GISEL-NEXT: v_log_f32_e32 v9, v9 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[2:3] ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v7, s[0:1] ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v8 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v9 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v4, -v8 -; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; 
GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v3, -v8 +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v5, v10 +; GFX900-GISEL-NEXT: v_ldexp_f32 v2, s11, v2 ; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v10 +; GFX900-GISEL-NEXT: v_log_f32_e32 v10, v2 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v9|, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v8, s[2:3] ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v4, v3, v4, -v8 -; GFX900-GISEL-NEXT: v_fma_f32 v4, v3, v5, v4 -; GFX900-GISEL-NEXT: v_add_f32_e32 v4, v8, v4 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v6 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] -; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v10 +; GFX900-GISEL-NEXT: v_fma_f32 v3, v10, v3, -v8 +; GFX900-GISEL-NEXT: v_fma_f32 v3, v10, v5, v3 +; GFX900-GISEL-NEXT: v_add_f32_e32 v3, v8, v3 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, v6 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v7, s[0:1] +; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v5 ; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -1860,60 +1892,67 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0 +; GFX1100-GISEL-NEXT: 
v_cmp_gt_f32_e64 s7, 0x800000, s1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s6 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v3, s3, v3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317217, v2 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v3 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_fma_f32 v12, 0x3f317217, v2, -v7 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: v_fma_f32 v13, 0x3f317217, v3, -v8 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3 +; GFX1100-GISEL-NEXT: v_add_f32_e32 v7, v7, v12 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_add_f32 v8, v8, v13 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v1 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fma_f32 v10, 0x3f317217, v0, -v5 ; GFX1100-GISEL-NEXT: v_fma_f32 v11, 0x3f317217, v1, -v6 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_fma_f32 v12, 0x3f317217, v2, -v7 -; GFX1100-GISEL-NEXT: v_fma_f32 v13, 0x3f317217, v3, -v8 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v10, 0x3377d1cf, v0 :: v_dual_fmac_f32 v11, 0x3377d1cf, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3 ; GFX1100-GISEL-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v9 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_sub_f32 v0, v0, v4 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3| -; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v1, v1, v9 :: v_dual_sub_f32 v2, v2, v14 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1100-GISEL-NEXT: global_store_b128 v5, v[0:3], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log_v4f32: @@ -2126,10 +2165,10 @@ define float @v_log_f32(float %in) { ; SI-GISEL: ; 
%bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -2175,16 +2214,16 @@ define float @v_log_f32(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 @@ -2224,10 +2263,10 @@ define float @v_log_f32(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; 
GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -2270,21 +2309,22 @@ define float @v_log_f32(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2329,10 +2369,10 @@ 
define float @v_log_fabs_f32(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -2378,16 +2418,16 @@ define float @v_log_fabs_f32(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 @@ -2427,10 +2467,10 @@ define float @v_log_fabs_f32(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: 
v_cmp_lt_f32_e64 vcc, |v0|, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -2475,20 +2515,22 @@ define float @v_log_fabs_f32(float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-GISEL-NEXT: 
v_cndmask_b32_e64 v1, 0, 0x41b17218, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2534,10 +2576,10 @@ define float @v_log_fneg_fabs_f32(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -2583,16 +2625,16 @@ define float @v_log_fneg_fabs_f32(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 @@ -2632,10 +2674,10 @@ define float 
@v_log_fneg_fabs_f32(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -2680,20 +2722,22 @@ define float @v_log_fneg_fabs_f32(float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -|v0| ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2740,10 +2784,10 @@ define float @v_log_fneg_f32(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -2789,16 +2833,16 @@ define float @v_log_fneg_f32(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: 
v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 @@ -2838,10 +2882,10 @@ define float @v_log_fneg_f32(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -2885,20 +2929,22 @@ define float @v_log_fneg_f32(float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -3304,10 +3350,10 @@ define float @v_log_f32_ninf(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -3353,16 +3399,16 @@ define float @v_log_f32_ninf(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 
0x3805fdf4, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 @@ -3402,10 +3448,10 @@ define float @v_log_f32_ninf(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -3448,21 +3494,22 @@ define float @v_log_f32_ninf(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4038,10 +4085,10 @@ define float @v_log_f32_nnan(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -4087,16 +4134,16 @@ define float @v_log_f32_nnan(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; 
VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 @@ -4136,10 +4183,10 @@ define float @v_log_f32_nnan(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -4182,21 +4229,22 @@ define float @v_log_f32_nnan(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, 
|v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4381,10 +4429,10 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -4430,16 +4478,16 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; 
VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 @@ -4479,10 +4527,10 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -4525,21 +4573,22 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 
0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4724,10 +4773,10 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -4773,16 +4822,16 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; 
VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 @@ -4822,10 +4871,10 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -4868,21 +4917,22 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4924,10 +4974,10 @@ define float @v_log_f32_nnan_ninf(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -4967,16 +5017,16 @@ define float @v_log_f32_nnan_ninf(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, 
vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317000, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 @@ -5010,10 +5060,10 @@ define float @v_log_f32_nnan_ninf(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -5051,18 +5101,20 @@ define float @v_log_f32_nnan_ninf(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; 
GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -5207,10 +5259,10 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -5250,16 +5302,16 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; 
VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317000, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 @@ -5293,10 +5345,10 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -5334,18 +5386,20 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, 
v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -5419,10 +5473,10 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -5468,16 +5522,16 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; 
VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 @@ -5517,10 +5571,10 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf @@ -5563,21 +5617,22 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index fd50d1b60fbd..d09df7583733 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -45,16 +45,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s0, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: 
v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0 ; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4 ; SI-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1 @@ -64,7 +65,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -104,25 +104,25 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 +; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; 
VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -162,25 +162,25 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e9a209a +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0 +; 
GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v2, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v3, v2 +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x411a209b +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -218,24 +218,26 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 -; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s2 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log10_f32: @@ -358,35 +360,36 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e9a209a +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 -; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v2 -; SI-GISEL-NEXT: v_fma_f32 v7, v2, v3, -v6 -; SI-GISEL-NEXT: v_fma_f32 v7, v2, v4, v7 -; SI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5 -; SI-GISEL-NEXT: 
v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v1 +; SI-GISEL-NEXT: v_fma_f32 v6, v1, v2, -v5 +; SI-GISEL-NEXT: v_fma_f32 v6, v1, v3, v6 +; SI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s7, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v5, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x411a209b ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 -; SI-GISEL-NEXT: v_fma_f32 v3, v1, v3, -v2 -; SI-GISEL-NEXT: v_fma_f32 v3, v1, v4, v3 -; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v5 +; SI-GISEL-NEXT: v_fma_f32 v2, v5, v2, -v1 +; SI-GISEL-NEXT: v_fma_f32 v2, v5, v3, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[0:1] ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 ; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 @@ -445,42 +448,43 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: 
v_cmp_lt_f32_e32 vcc, s6, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2 -; VI-GISEL-NEXT: v_log_f32_e32 v2, v2 -; VI-GISEL-NEXT: v_and_b32_e32 v4, 0xfffff000, v2 -; VI-GISEL-NEXT: v_sub_f32_e32 v5, v2, v4 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v1, s6, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v1, v1 +; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v1 +; VI-GISEL-NEXT: v_sub_f32_e32 v4, v1, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v5 -; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5 -; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v3 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 -; VI-GISEL-NEXT: v_log_f32_e32 v1, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s7, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v3, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x411a209b ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0 -; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v5, v1, v2 +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0 +; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v3 +; VI-GISEL-NEXT: v_sub_f32_e32 v5, v3, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x369a84fb, 
v5 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v7, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v5 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v3 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 +; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v5 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1] ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 @@ -531,37 +535,38 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf +; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s10, v2 -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v2 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v7, v2, v3, -v6 -; GFX900-GISEL-NEXT: v_fma_f32 v7, v2, v4, v7 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s10, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v1 +; GFX900-GISEL-NEXT: v_fma_f32 v7, v1, v3, -v6 +; GFX900-GISEL-NEXT: 
v_fma_f32 v7, v1, v4, v7 ; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1] ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s11, v0 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x411a209b -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v3, v1, v3, -v2 -; GFX900-GISEL-NEXT: v_fma_f32 v3, v1, v4, v3 -; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v5 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[0:1] -; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s11, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v6 +; GFX900-GISEL-NEXT: v_fma_f32 v3, v6, v3, -v1 +; GFX900-GISEL-NEXT: v_fma_f32 v3, v6, v4, v3 +; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, v5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -608,31 +613,37 @@ define 
amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s3, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, 0x3e9a209a, v0 :: v_dual_mul_f32 v3, 0x3e9a209a, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v3, 0x3e9a209a, v1 :: v_dual_lshlrev_b32 v0, 5, v0 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s2, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_fma_f32 v5, 0x3e9a209a, v1, -v3 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v5, 0x3284fbcf, v1 +; GFX1100-GISEL-NEXT: 
s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3e9a209a, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s5 ; GFX1100-GISEL-NEXT: v_fma_f32 v4, 0x3e9a209a, v0, -v2 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_fma_f32 v5, 0x3e9a209a, v1, -v3 -; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v4, 0x3284fbcf, v0 +; GFX1100-GISEL-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s5 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_mov_b32 v2, 0 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm @@ -808,49 +819,51 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a +; SI-GISEL-NEXT: 
v_mov_b32_e32 v2, 0x3e9a209a +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf -; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 -; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v0 -; SI-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v6 -; SI-GISEL-NEXT: v_fma_f32 v7, v0, v4, v7 -; SI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0 +; SI-GISEL-NEXT: v_fma_f32 v6, v0, v2, -v5 +; SI-GISEL-NEXT: v_fma_f32 v6, v0, v3, v6 +; SI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v2, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v6, s9, v6 -; SI-GISEL-NEXT: v_log_f32_e32 v6, v6 -; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v5, s9, v5 +; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 +; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x411a209b +; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8 -; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v6 -; 
SI-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1 -; SI-GISEL-NEXT: v_fma_f32 v9, v6, v3, -v8 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v1 -; SI-GISEL-NEXT: v_fma_f32 v9, v6, v4, v9 -; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v8, s[2:3] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v7, s[0:1] -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6 -; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v2 -; SI-GISEL-NEXT: v_fma_f32 v3, v2, v3, -v6 -; SI-GISEL-NEXT: v_fma_f32 v3, v2, v4, v3 -; SI-GISEL-NEXT: v_add_f32_e32 v3, v6, v3 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v7 +; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_fma_f32 v8, v5, v2, -v7 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_fma_f32 v8, v5, v3, v8 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s10, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 +; SI-GISEL-NEXT: v_log_f32_e32 v8, v1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1] +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v8 +; SI-GISEL-NEXT: v_fma_f32 v2, v8, v2, -v5 +; SI-GISEL-NEXT: v_fma_f32 v2, v8, v3, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v2, v5, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v8|, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 ; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 @@ -927,12 +940,13 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x 
float> %in) ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v3 @@ -943,45 +957,46 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v2 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s9, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; VI-GISEL-NEXT: v_ldexp_f32 v3, s9, v3 ; VI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x411a209b -; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v6 -; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v3 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x411a209b +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 +; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v3 +; VI-GISEL-NEXT: v_sub_f32_e32 v6, v3, v5 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 
v7, v3, v6 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v7 -; VI-GISEL-NEXT: v_log_f32_e32 v2, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, v4 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, v6, s[2:3] -; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5 +; VI-GISEL-NEXT: v_ldexp_f32 v1, s10, v1 +; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-GISEL-NEXT: v_log_f32_e32 v6, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[2:3] +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v3 -; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 -; VI-GISEL-NEXT: v_sub_f32_e32 v6, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v6 +; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v6 +; VI-GISEL-NEXT: v_sub_f32_e32 v5, v6, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v5 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6 -; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5 +; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v4 -; VI-GISEL-NEXT: 
v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v5 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v6|, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 @@ -1046,49 +1061,51 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e9a209a +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s8, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v6 +; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v2, -v6 ; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v4, v7 ; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v2, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s9, v6 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; 
GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v6 +; GFX900-GISEL-NEXT: v_ldexp_f32 v6, s9, v6 ; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v6 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v6 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v3, -v8 -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v2, -v8 +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v4, v9 +; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s10, v1 ; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 +; GFX900-GISEL-NEXT: v_log_f32_e32 v9, v1 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v5 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v8, s[2:3] ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v7, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v3, v2, v3, -v6 -; GFX900-GISEL-NEXT: v_fma_f32 v3, v2, v4, v3 -; GFX900-GISEL-NEXT: v_add_f32_e32 v3, v6, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v9 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v9, v2, -v6 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v9, v4, v2 +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v9|, v5 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1] +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, 
v2, v4 ; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -1156,49 +1173,55 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s6 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v2 :: v_dual_lshlrev_b32 v0, 5, v0 +; 
GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_fma_f32 v8, 0x3e9a209a, v2, -v5 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v8, 0x3284fbcf, v2 +; GFX1100-GISEL-NEXT: v_add_f32_e32 v5, v5, v8 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v1 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v6, 0x3e9a209a, v0, -v3 +; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v6, 0x3284fbcf, v0 :: v_dual_lshlrev_b32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v2 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v1 ; GFX1100-GISEL-NEXT: v_fma_f32 v7, 0x3e9a209a, v1, -v4 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v6, 0x3284fbcf, v0 -; GFX1100-GISEL-NEXT: v_fma_f32 v8, 0x3e9a209a, v2, -v5 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 -; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v8, 0x3284fbcf, v2 :: v_dual_mov_b32 v3, 0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v1, v1, v10 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_sub_f32 v0, v0, v9 +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v9 :: v_dual_sub_f32 v1, v1, v10 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6 @@ -1433,62 +1456,65 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; 
SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x3284fbcf -; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; SI-GISEL-NEXT: v_fma_f32 v7, v0, v4, -v1 -; SI-GISEL-NEXT: v_fma_f32 v7, v0, v5, v7 -; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v6 +; SI-GISEL-NEXT: v_fma_f32 v6, v0, v3, -v1 +; SI-GISEL-NEXT: v_fma_f32 v6, v0, v4, v6 +; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v6 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s9, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8 -; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v1 -; SI-GISEL-NEXT: v_fma_f32 v9, v1, v4, -v8 -; SI-GISEL-NEXT: v_fma_f32 v9, v1, v5, v9 +; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x411a209b +; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v7 +; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v1 +; SI-GISEL-NEXT: v_fma_f32 v8, v1, v3, -v7 +; SI-GISEL-NEXT: v_fma_f32 v8, v1, v4, v8 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v9, s10, v9 -; SI-GISEL-NEXT: v_log_f32_e32 v9, v9 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v6 -; SI-GISEL-NEXT: 
v_cndmask_b32_e64 v1, v1, v8, s[2:3] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v7, s[0:1] +; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 5, v8 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v8, s10, v8 +; SI-GISEL-NEXT: v_log_f32_e32 v8, v8 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v8 -; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v9 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 -; SI-GISEL-NEXT: v_fma_f32 v10, v9, v4, -v8 -; SI-GISEL-NEXT: v_log_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v10, v9, v5, v10 -; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v10 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v9|, v6 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v8, s[2:3] -; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 -; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v3 -; SI-GISEL-NEXT: v_fma_f32 v4, v3, v4, -v8 -; SI-GISEL-NEXT: v_fma_f32 v4, v3, v5, v4 -; SI-GISEL-NEXT: v_add_f32_e32 v4, v8, v4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v6 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v7 +; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v8 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_fma_f32 v9, v8, v3, -v7 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-GISEL-NEXT: v_fma_f32 v9, v8, v4, v9 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, s11, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v9 +; SI-GISEL-NEXT: v_log_f32_e32 v9, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v8|, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v7, s[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc +; 
SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v7 +; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v9 +; SI-GISEL-NEXT: v_fma_f32 v3, v9, v3, -v7 +; SI-GISEL-NEXT: v_fma_f32 v3, v9, v4, v3 +; SI-GISEL-NEXT: v_add_f32_e32 v3, v7, v3 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v9|, v5 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] ; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -1581,12 +1607,13 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 ; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v1 @@ -1597,62 +1624,64 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 
1, s[0:1] +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v1, s9, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x411a209b -; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v6 -; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v7, v1, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v6 -; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8 -; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v7 -; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x411a209b +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 +; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v1 +; VI-GISEL-NEXT: v_sub_f32_e32 v6, v1, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v5 +; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 1.0, v3, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v7, s10, v7 -; VI-GISEL-NEXT: v_log_f32_e32 v7, v7 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v4 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v5, s[0:1] -; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6 -; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v6 +; VI-GISEL-NEXT: v_ldexp_f32 v6, s10, v6 +; VI-GISEL-NEXT: v_log_f32_e32 v6, v6 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] 
+; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 +; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v6 +; VI-GISEL-NEXT: v_sub_f32_e32 v7, v6, v5 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 -; VI-GISEL-NEXT: v_sub_f32_e32 v8, v7, v6 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v8 -; VI-GISEL-NEXT: v_mul_f32_e32 v10, 0x369a84fb, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v9, v10, v9 -; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a2000, v8 -; VI-GISEL-NEXT: v_log_f32_e32 v3, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 -; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6 -; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v7|, v4 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v6, s[2:3] -; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6 -; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v3 -; VI-GISEL-NEXT: v_sub_f32_e32 v7, v3, v6 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v7 -; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v5 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8 ; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v7 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5 +; VI-GISEL-NEXT: v_ldexp_f32 v2, s11, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7 +; VI-GISEL-NEXT: v_log_f32_e32 v7, v2 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v5, s[2:3] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5 +; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v7 +; VI-GISEL-NEXT: v_sub_f32_e32 v6, v7, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v6 +; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v5 +; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8 
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6 -; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v4 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1] +; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5 +; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, v3 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1730,61 +1759,64 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a +; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x3284fbcf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s8, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x3284fbcf ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v4, -v1 +; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v1 ; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v5, v7 ; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v7 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 
v0, v1, s[0:1] ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s9, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v1 -; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v4, -v8 +; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v3, -v8 ; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v5, v9 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v9, s10, v9 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v9, 5, v9 +; GFX900-GISEL-NEXT: v_ldexp_f32 v9, s10, v9 ; GFX900-GISEL-NEXT: v_log_f32_e32 v9, v9 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[2:3] ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v7, s[0:1] ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v8 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v9 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v4, -v8 -; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v3, -v8 +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v5, v10 +; GFX900-GISEL-NEXT: v_ldexp_f32 v2, s11, v2 ; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v10 +; GFX900-GISEL-NEXT: v_log_f32_e32 v10, v2 ; GFX900-GISEL-NEXT: 
v_cmp_lt_f32_e64 s[2:3], |v9|, v6 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v8, s[2:3] ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v4, v3, v4, -v8 -; GFX900-GISEL-NEXT: v_fma_f32 v4, v3, v5, v4 -; GFX900-GISEL-NEXT: v_add_f32_e32 v4, v8, v4 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v6 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] -; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v10 +; GFX900-GISEL-NEXT: v_fma_f32 v3, v10, v3, -v8 +; GFX900-GISEL-NEXT: v_fma_f32 v3, v10, v5, v3 +; GFX900-GISEL-NEXT: v_add_f32_e32 v3, v8, v3 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, v6 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v7, s[0:1] +; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v5 ; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -1860,60 +1892,67 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s6 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v3, s3, v3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v2 +; 
GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v3 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_fma_f32 v12, 0x3e9a209a, v2, -v7 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: v_fma_f32 v13, 0x3e9a209a, v3, -v8 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3 +; GFX1100-GISEL-NEXT: v_add_f32_e32 v7, v7, v12 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_add_f32 v8, v8, v13 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v1 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fma_f32 v10, 0x3e9a209a, v0, -v5 ; GFX1100-GISEL-NEXT: v_fma_f32 v11, 0x3e9a209a, v1, -v6 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_fma_f32 v12, 0x3e9a209a, v2, -v7 -; GFX1100-GISEL-NEXT: v_fma_f32 v13, 0x3e9a209a, v3, -v8 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v10, 0x3284fbcf, v0 :: v_dual_fmac_f32 v11, 0x3284fbcf, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3 ; GFX1100-GISEL-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: 
v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2| -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v9 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_sub_f32 v0, v0, v4 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3| -; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v1, v1, v9 :: v_dual_sub_f32 v2, v2, v14 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1100-GISEL-NEXT: global_store_b128 v5, v[0:3], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log10_v4f32: @@ -2126,10 +2165,10 @@ define float @v_log10_f32(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; 
SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -2175,16 +2214,16 @@ define float @v_log10_f32(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 @@ -2224,10 +2263,10 @@ define float @v_log10_f32(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; 
GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -2270,21 +2309,22 @@ define float @v_log10_f32(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2329,10 +2369,10 @@ define float @v_log10_fabs_f32(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 -; SI-GISEL-NEXT: 
v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -2378,16 +2418,16 @@ define float @v_log10_fabs_f32(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 @@ -2427,10 +2467,10 @@ define float @v_log10_fabs_f32(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -2475,20 +2515,22 @@ define float @v_log10_fabs_f32(float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2534,10 +2576,10 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; SI-GISEL: ; %bb.0: ; 
SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -2583,16 +2625,16 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 @@ -2632,10 +2674,10 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 -; GFX900-GISEL-NEXT: 
v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -2680,20 +2722,22 @@ define float @v_log10_fneg_fabs_f32(float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -|v0| ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0 +; 
GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2740,10 +2784,10 @@ define float @v_log10_fneg_f32(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -2789,16 +2833,16 @@ define float @v_log10_fneg_f32(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 @@ -2838,10 +2882,10 @@ define float @v_log10_fneg_f32(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -2885,20 +2929,22 @@ define float @v_log10_fneg_f32(float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -3304,10 +3350,10 @@ define float @v_log10_f32_ninf(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -3353,16 +3399,16 @@ define float @v_log10_f32_ninf(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 ; VI-GISEL-NEXT: 
v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 @@ -3402,10 +3448,10 @@ define float @v_log10_f32_ninf(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -3448,21 +3494,22 @@ define float @v_log10_f32_ninf(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 
0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4038,10 +4085,10 @@ define float @v_log10_f32_nnan(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -4087,16 +4134,16 @@ define float @v_log10_f32_nnan(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: 
v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 @@ -4136,10 +4183,10 @@ define float @v_log10_f32_nnan(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -4182,21 +4229,22 @@ define float @v_log10_f32_nnan(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4381,10 +4429,10 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -4430,16 +4478,16 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; 
VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 @@ -4479,10 +4527,10 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -4525,21 +4573,22 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: 
v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4724,10 +4773,10 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -4773,16 +4822,16 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: 
v_mul_f32_e32 v3, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 @@ -4822,10 +4871,10 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -4868,21 +4917,22 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 
0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4924,10 +4974,10 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -4967,16 +5017,16 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a2000, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 @@ -5010,10 +5060,10 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -5051,18 +5101,20 @@ define float @v_log10_f32_nnan_ninf(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; 
GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -5207,10 +5259,10 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -5250,16 +5302,16 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; 
VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a2000, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 @@ -5293,10 +5345,10 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -5334,18 +5386,20 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: 
v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -5419,10 +5473,10 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -5468,16 +5522,16 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, 
v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 @@ -5517,10 +5571,10 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf @@ -5563,21 +5617,22 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; 
GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 2c5a9f58a199..8b3b79b0b1bd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -36,14 +36,14 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; SI-GISEL-NEXT: s_load_dword s2, s[4:5], 0xb ; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -74,13 +74,13 @@ define amdgpu_kernel void 
@s_log2_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s2, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -108,20 +108,19 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-GISEL-LABEL: s_log2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: 
v_ldexp_f32 v0, s2, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -147,20 +146,22 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log2_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s2 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s2, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log2_f32: @@ -242,21 +243,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) 
%out, <2 x float> %in) ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v3, s6, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_log_f32_e32 v1, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v3, v0 -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, s6, v2 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s7, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v3, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0 +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v3, v1 ; SI-GISEL-NEXT: s_mov_b32 s6, -1 ; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -291,21 +293,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 
vcc, s6, v0 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s6, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0 -; VI-GISEL-NEXT: v_log_f32_e32 v3, v3 -; VI-GISEL-NEXT: v_log_f32_e32 v1, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v3, v0 -; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v2, s6, v2 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s7, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v2, v2 +; VI-GISEL-NEXT: v_log_f32_e32 v3, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v3, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -339,22 +342,23 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL: ; %bb.0: ; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, s10, v3 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s11, v0 +; 
GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v3, s10, v3 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s11, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] +; GFX900-GISEL-NEXT: v_log_f32_e32 v4, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v3, v0 -; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 ; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -387,23 +391,28 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-GISEL-LABEL: s_log2_v2f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s5 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, 
s2, v0 :: v_dual_mul_f32 v1, s3, v1 -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s3, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 5, v0 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s2, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX1100-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log2_v2f32: @@ -506,32 +515,34 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, 
v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 -; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_ldexp_f32_e32 v3, s9, v3 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s10, v1 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v4, s1, v4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1] -; SI-GISEL-NEXT: v_log_f32_e32 v4, v4 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1 -; SI-GISEL-NEXT: v_log_f32_e32 v2, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] +; SI-GISEL-NEXT: v_log_f32_e32 v4, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1] +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v3, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 -; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_sub_f32_e32 v2, v4, v2 ; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 ; SI-GISEL-NEXT: s_endpgm @@ -571,32 +582,34 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-GISEL-NEXT: 
s_load_dwordx4 s[8:11], s[4:5], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 -; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v3, s9, v3 +; VI-GISEL-NEXT: v_ldexp_f32 v1, s10, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s1, v4 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1 -; VI-GISEL-NEXT: v_log_f32_e32 v4, v4 -; VI-GISEL-NEXT: v_log_f32_e32 v2, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] -; VI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; VI-GISEL-NEXT: v_log_f32_e32 v3, v3 +; VI-GISEL-NEXT: v_log_f32_e32 v4, v1 +; VI-GISEL-NEXT: 
v_cndmask_b32_e64 v1, 0, v2, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v3, v1 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm ; @@ -637,28 +650,30 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v4, s1, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; GFX900-GISEL-NEXT: v_ldexp_f32 v4, s1, v4 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s2, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v4, v4 -; 
GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] +; GFX900-GISEL-NEXT: v_log_f32_e32 v5, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1 -; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -702,33 +717,40 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log2_v3f32: ; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 ; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v6, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s6 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s3 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: 
v_dual_mul_f32 v1, s1, v1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v3 :: v_dual_mov_b32 v3, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2 +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX1100-GISEL-NEXT: global_store_b96 v6, v[0:2], s[4:5] ; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log2_v3f32: @@ -865,34 +887,37 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s9, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v3, s[0:1] ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc -; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 -; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 -; SI-GISEL-NEXT: v_log_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; SI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 -; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v4, s10, v4 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, s11, v2 +; SI-GISEL-NEXT: v_log_f32_e32 v4, v4 +; SI-GISEL-NEXT: v_log_f32_e32 v5, v2 +; 
SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] +; SI-GISEL-NEXT: v_sub_f32_e32 v2, v4, v2 +; SI-GISEL-NEXT: v_sub_f32_e32 v3, v5, v3 ; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 ; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm @@ -942,33 +967,36 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v1, s9, v1 ; VI-GISEL-NEXT: v_log_f32_e32 v1, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v3, s[0:1] ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 -; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; VI-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5 -; VI-GISEL-NEXT: 
v_mul_f32_e32 v2, s11, v2 -; VI-GISEL-NEXT: v_log_f32_e32 v5, v5 -; VI-GISEL-NEXT: v_log_f32_e32 v3, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 -; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; VI-GISEL-NEXT: v_ldexp_f32 v4, s10, v4 +; VI-GISEL-NEXT: v_ldexp_f32 v2, s11, v2 +; VI-GISEL-NEXT: v_log_f32_e32 v4, v4 +; VI-GISEL-NEXT: v_log_f32_e32 v5, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v5, v3 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1018,34 +1046,37 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34 ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s8, v0 +; 
GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s9, v1 ; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[0:1] ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX900-GISEL-NEXT: v_ldexp_f32 v5, s10, v5 +; GFX900-GISEL-NEXT: v_ldexp_f32 v2, s11, v2 ; GFX900-GISEL-NEXT: v_log_f32_e32 v5, v5 -; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] +; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v2 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1] ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 -; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v6, v3 ; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -1095,39 +1126,46 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log2_v4f32: ; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_clause 0x1 ; 
GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s2 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s6 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v0, 5, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v3, 5, v3 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v3, s3, v3 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1100-GISEL-NEXT: v_dual_sub_f32 v3, v3, v7 :: v_dual_lshlrev_b32 v2, 5, v2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX1100-GISEL-NEXT: global_store_b128 v8, v[0:3], s[4:5] ; GFX1100-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_log2_v4f32: @@ -1243,19 +1281,19 @@ define float @v_log2_f32(float 
%in) { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32: ; VI-SDAG: ; %bb.0: @@ -1271,6 +1309,20 @@ define float @v_log2_f32(float %in) { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: 
s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1285,6 +1337,20 @@ define float @v_log2_f32(float %in) { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1304,10 +1370,12 @@ define float @v_log2_f32(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1341,19 +1409,19 @@ define float 
@v_log2_fabs_f32(float %in) { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_fabs_f32: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_fabs_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_fabs_f32: ; VI-SDAG: ; %bb.0: @@ -1369,6 +1437,20 @@ define float @v_log2_fabs_f32(float %in) { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_fabs_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; 
VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_fabs_f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1383,6 +1465,20 @@ define float @v_log2_fabs_f32(float %in) { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_fabs_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_fabs_f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1403,10 +1499,11 @@ define float @v_log2_fabs_f32(float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, 
v1 @@ -1441,19 +1538,19 @@ define float @v_log2_fneg_fabs_f32(float %in) { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_fneg_fabs_f32: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_fneg_fabs_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_fneg_fabs_f32: ; VI-SDAG: ; %bb.0: @@ -1469,6 +1566,20 @@ define float @v_log2_fneg_fabs_f32(float %in) { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_fneg_fabs_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: 
v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_fneg_fabs_f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1483,6 +1594,20 @@ define float @v_log2_fneg_fabs_f32(float %in) { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_fneg_fabs_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_fneg_fabs_f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1503,10 +1628,11 @@ define float @v_log2_fneg_fabs_f32(float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -|v0| ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; 
GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1542,19 +1668,19 @@ define float @v_log2_fneg_f32(float %in) { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_fneg_f32: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_fneg_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_fneg_f32: ; VI-SDAG: ; %bb.0: @@ -1570,6 +1696,20 @@ define float @v_log2_fneg_f32(float %in) { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_fneg_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: 
v_ldexp_f32 v0, -v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_fneg_f32: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1584,6 +1724,20 @@ define float @v_log2_fneg_f32(float %in) { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_fneg_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_fneg_f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1604,10 +1758,11 @@ define float @v_log2_fneg_f32(float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 -; GFX1100-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1642,19 +1797,19 @@ define float @v_log2_f32_fast(float %in) { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_fast: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32_fast: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_fast: ; VI-SDAG: ; %bb.0: @@ -1670,6 +1825,20 @@ define float @v_log2_f32_fast(float %in) { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_fast: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 
v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_fast: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1684,6 +1853,20 @@ define float @v_log2_f32_fast(float %in) { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_fast: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_fast: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1703,10 +1886,12 @@ define float @v_log2_f32_fast(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: 
v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1740,19 +1925,19 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_unsafe_math_attr: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32_unsafe_math_attr: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_unsafe_math_attr: ; VI-SDAG: ; %bb.0: @@ -1768,6 +1953,20 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_unsafe_math_attr: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_unsafe_math_attr: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1782,6 +1981,20 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_unsafe_math_attr: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_unsafe_math_attr: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1801,10 +2014,12 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: 
v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1838,19 +2053,19 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_approx_fn_attr: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32_approx_fn_attr: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_approx_fn_attr: ; VI-SDAG: ; %bb.0: @@ 
-1866,6 +2081,20 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_approx_fn_attr: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_approx_fn_attr: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1880,6 +2109,20 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_approx_fn_attr: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_approx_fn_attr: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1899,10 +2142,12 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1936,19 +2181,19 @@ define float @v_log2_f32_ninf(float %in) { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_ninf: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: 
v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_ninf: ; VI-SDAG: ; %bb.0: @@ -1964,6 +2209,20 @@ define float @v_log2_f32_ninf(float %in) { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_ninf: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_ninf: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1978,6 +2237,20 @@ define float @v_log2_f32_ninf(float %in) { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_ninf: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_ninf: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1997,10 +2270,12 @@ define float 
@v_log2_f32_ninf(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2034,19 +2309,19 @@ define float @v_log2_f32_afn(float %in) { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_afn: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32_afn: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 
v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_afn: ; VI-SDAG: ; %bb.0: @@ -2062,6 +2337,20 @@ define float @v_log2_f32_afn(float %in) { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_afn: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2076,6 +2365,20 @@ define float @v_log2_f32_afn(float %in) { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_afn: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_afn: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) @@ -2095,10 +2398,12 @@ define float @v_log2_f32_afn(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2158,19 +2463,19 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_afn_dynamic: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32_afn_dynamic: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, 
vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_afn_dynamic: ; VI-SDAG: ; %bb.0: @@ -2186,6 +2491,20 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_afn_dynamic: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_afn_dynamic: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2200,6 +2519,20 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_afn_dynamic: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; 
GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_afn_dynamic: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2219,10 +2552,12 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2256,19 +2591,19 @@ define float @v_fabs_log2_f32_afn(float %in) { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_fabs_log2_f32_afn: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_fabs_log2_f32_afn: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, |v0|, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_fabs_log2_f32_afn: ; VI-SDAG: ; %bb.0: @@ -2284,6 +2619,20 @@ define float @v_fabs_log2_f32_afn(float %in) { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_fabs_log2_f32_afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_fabs_log2_f32_afn: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2298,6 +2647,20 @@ define float @v_fabs_log2_f32_afn(float %in) { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_fabs_log2_f32_afn: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: 
v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_fabs_log2_f32_afn: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2318,10 +2681,11 @@ define float @v_fabs_log2_f32_afn(float %in) { ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2382,19 +2746,19 @@ define float @v_log2_f32_nnan(float %in) { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_nnan: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: 
v_log2_f32_nnan: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_nnan: ; VI-SDAG: ; %bb.0: @@ -2410,6 +2774,20 @@ define float @v_log2_f32_nnan(float %in) { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_nnan: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_nnan: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2424,6 +2802,20 @@ define float @v_log2_f32_nnan(float %in) { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_nnan: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: 
v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_nnan: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2443,10 +2835,12 @@ define float @v_log2_f32_nnan(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2506,19 +2900,19 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_nnan_dynamic: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: 
v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32_nnan_dynamic: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_nnan_dynamic: ; VI-SDAG: ; %bb.0: @@ -2534,6 +2928,20 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_nnan_dynamic: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_nnan_dynamic: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2548,6 +2956,20 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_nnan_dynamic: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_dynamic: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2567,10 +2989,12 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2630,19 +3054,19 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_ninf_dynamic: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; 
GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32_ninf_dynamic: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_ninf_dynamic: ; VI-SDAG: ; %bb.0: @@ -2658,6 +3082,20 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_ninf_dynamic: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_ninf_dynamic: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2672,6 +3110,20 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_ninf_dynamic: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_ninf_dynamic: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2691,10 +3143,12 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2728,19 +3182,19 @@ define float @v_log2_f32_nnan_ninf(float %in) { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; 
GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32_nnan_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_nnan_ninf: ; VI-SDAG: ; %bb.0: @@ -2756,6 +3210,20 @@ define float @v_log2_f32_nnan_ninf(float %in) { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_nnan_ninf: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_nnan_ninf: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2770,6 +3238,20 @@ define float @v_log2_f32_nnan_ninf(float %in) { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; 
GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_nnan_ninf: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2789,10 +3271,12 @@ define float @v_log2_f32_nnan_ninf(float %in) { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2852,19 +3336,19 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic: ; VI-SDAG: ; %bb.0: @@ -2880,6 +3364,20 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic: ; GFX900-SDAG: ; 
%bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2894,6 +3392,20 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2913,10 +3425,12 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -2976,19 +3490,19 @@ define float 
@v_log2_f32_dynamic_mode(float %in) #1 { ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX689-GISEL-LABEL: v_log2_f32_dynamic_mode: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-GISEL-LABEL: v_log2_f32_dynamic_mode: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_log2_f32_dynamic_mode: ; VI-SDAG: ; %bb.0: @@ -3004,6 +3518,20 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; VI-GISEL-LABEL: v_log2_f32_dynamic_mode: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; 
VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX900-SDAG-LABEL: v_log2_f32_dynamic_mode: ; GFX900-SDAG: ; %bb.0: ; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3018,6 +3546,20 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX900-GISEL-LABEL: v_log2_f32_dynamic_mode: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX1100-SDAG-LABEL: v_log2_f32_dynamic_mode: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3037,10 +3579,12 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; 
GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 4de0c548ad38..795ed6d542a1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -3,6 +3,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s declare half @llvm.rint.f16(half %a) declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) @@ -63,6 +64,24 @@ define amdgpu_kernel void @rint_f16( ; GFX11-NEXT: v_rndne_f16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: rint_f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_rndne_f16_e32 v0, v0 +; GFX12-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -168,6 +187,28 @@ define 
amdgpu_kernel void @rint_v2f16( ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: rint_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_rndne_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_rndne_f16_e32 v1, v1 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index 2bb89fdabda7..6927636ad04a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -4,6 +4,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_f16: 
@@ -80,6 +81,19 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_sin_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sin_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sin_f16_e32 v1, v1 +; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load half, ptr addrspace(1) %a %r.val = call half @llvm.sin.f16(half %a.val) store half %r.val, ptr addrspace(1) %r @@ -188,6 +202,24 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sin_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX12-NEXT: v_sin_f16_e32 v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-NEXT: v_sin_f16_e32 v2, v2 +; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %a.val = load <2 x half>, ptr addrspace(1) %a %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val) store <2 x half> %r.val, ptr addrspace(1) %r diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index 
47777e3853e8..0d58afd1812d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -2,6 +2,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s declare half @llvm.trunc.f16(half %a) declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) @@ -62,6 +63,24 @@ define amdgpu_kernel void @trunc_f16( ; GFX11-NEXT: v_trunc_f16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: trunc_f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_trunc_f16_e32 v0, v0 +; GFX12-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -147,6 +166,28 @@ define amdgpu_kernel void @trunc_v2f16( ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: trunc_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: 
s_mov_b32 s6, -1 +; GFX12-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-NEXT: s_mov_b32 s10, s6 +; GFX12-NEXT: s_mov_b32 s11, s7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s8, s2 +; GFX12-NEXT: s_mov_b32 s9, s3 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-NEXT: s_mov_b32 s5, s1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-NEXT: v_trunc_f16_e32 v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_trunc_f16_e32 v1, v1 +; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 33007e5b285d..3be17f9538d0 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -1333,5 +1333,668 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 { ret i48 %a } +define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 { +; CI-LABEL: lshr_mad_i64_1: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_movk_i32 s4, 0xfc19 +; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_1: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_movk_i32 s4, 0xfc19 +; SI-NEXT: v_mul_hi_u32 v2, v1, s4 +; SI-NEXT: v_mul_lo_u32 v3, v1, s4 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0xfc19 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; 
GFX9-NEXT: v_sub_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, s0xfffffffffffffc19 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define i64 @lshr_mad_i64_2(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_2: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_movk_i32 s4, 0xd1 +; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_2: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_movk_i32 s4, 0xd1 +; SI-NEXT: v_mul_hi_u32 v2, v1, s4 +; SI-NEXT: v_mul_lo_u32 v3, v1, s4 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0xd1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, 
s4, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, s0xffffffff000000d1 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define i64 @lshr_mad_i64_3(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_3: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_movk_i32 s4, 0xfc88 +; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_3: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_movk_i32 s4, 0xfc88 +; SI-NEXT: v_mul_hi_u32 v2, v1, s4 +; SI-NEXT: v_mul_lo_u32 v3, v1, s4 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0xfc88 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], 
s[4:5], v1, s4, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 s0xfffffffffffffc88, %lsh + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { +; CI-LABEL: lshr_mad_i64_4: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mul_lo_u32 v3, v2, v0 +; CI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v1, v0, 0 +; CI-NEXT: s_movk_i32 s4, 0xfc88 +; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[1:2] +; CI-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_4: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mul_lo_u32 v2, v2, v0 +; SI-NEXT: v_mul_hi_u32 v3, v1, v0 +; SI-NEXT: s_movk_i32 s4, 0xfc88 +; SI-NEXT: v_mul_lo_u32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; SI-NEXT: v_mul_hi_u32 v3, v2, s4 +; SI-NEXT: v_mul_lo_u32 v1, v2, s4 +; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 +; SI-NEXT: v_add_i32_e32 
v0, vcc, v1, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v0, v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-NEXT: s_movk_i32 s4, 0xfc88 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[4:5] +; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_4: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v2, v0, v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + 
%ext = zext i32 %arg0 to i64 + %mul1 = mul i64 %arg1, %ext + %lsh = lshr i64 %mul1, 32 + %mul2 = mul i64 %lsh, s0xfffffffffffffc88 + %mad = add i64 %mul2, %mul1 + ret i64 %mad +} + +define i64 @lshr_mad_i64_negative_1(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_negative_1: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; CI-NEXT: s_movk_i32 s4, 0xfc19 +; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1] +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_negative_1: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; SI-NEXT: s_movk_i32 s4, 0xfc19 +; SI-NEXT: v_mul_lo_u32 v3, v2, s4 +; SI-NEXT: v_mul_hi_i32 v2, v2, s4 +; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_negative_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; GFX9-NEXT: s_movk_i32 s4, 0xfc19 +; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-LABEL: lshr_mad_i64_negative_1: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mad_i64_i32 v[2:3], null, 0xfffffc19, v4, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1150-LABEL: lshr_mad_i64_negative_1: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1] +; GFX1150-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_negative_1: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 36 + %mul = mul i64 %lsh, s0xfffffffffffffc19 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_negative_2: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_movk_i32 s4, 0xd1 +; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; CI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v0 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_negative_2: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_movk_i32 s4, 0xd1 +; SI-NEXT: v_mul_hi_u32 v2, v1, s4 +; SI-NEXT: v_mul_lo_u32 v4, v1, s4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v1 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_negative_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0xd1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_negative_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v0 
+; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_negative_2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, s0xffffff00000000d1 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_negative_3: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshr_b64 v[2:3], v[0:1], 22 +; CI-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 +; CI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_negative_3: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 22 +; SI-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 +; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_negative_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 
0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_negative_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 +; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_negative_3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %op = add i64 %arg0, 1 + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, s0xfffffffffffffc00 + %mad = add i64 %mul, %op + + ret i64 %mad +} + +define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { +; CI-LABEL: lshr_mad_i64_negative_4: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1] +; CI-NEXT: v_mul_lo_u32 v0, v1, v1 +; CI-NEXT: v_add_i32_e32 v1, vcc, v0, v3 +; CI-NEXT: v_mov_b32_e32 
v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_negative_4: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mul_hi_u32 v2, v1, v0 +; SI-NEXT: v_mul_lo_u32 v3, v1, v1 +; SI-NEXT: v_mul_lo_u32 v4, v1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_negative_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v1, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-LABEL: lshr_mad_i64_negative_4: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1150-LABEL: lshr_mad_i64_negative_4: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mov_b32_e32 v0, v4 +; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, v3 +; GFX1150-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_negative_4: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: 
s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, %arg0 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 { +; CI-LABEL: lshr_mad_i64_sgpr: +; CI: ; %bb.0: +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v2, 0xffff1c18 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s1, v2, v[0:1] +; CI-NEXT: v_subrev_i32_e32 v1, vcc, s1, v1 +; CI-NEXT: v_readfirstlane_b32 s0, v0 +; CI-NEXT: v_readfirstlane_b32 s1, v1 +; CI-NEXT: ; return to shader part epilog +; +; SI-LABEL: lshr_mad_i64_sgpr: +; SI: ; %bb.0: +; SI-NEXT: v_mov_b32_e32 v0, 0xffff1c18 +; SI-NEXT: v_mul_hi_u32 v0, s1, v0 +; SI-NEXT: s_mul_i32 s2, s1, 0xffff1c18 +; SI-NEXT: v_readfirstlane_b32 s3, v0 +; SI-NEXT: s_sub_i32 s3, s3, s1 +; SI-NEXT: s_add_u32 s0, s2, s0 +; SI-NEXT: s_addc_u32 s1, s3, s1 +; SI-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: lshr_mad_i64_sgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mul_hi_u32 s2, s1, 0xffff1c18 +; GFX9-NEXT: s_sub_i32 s2, s2, s1 +; GFX9-NEXT: s_mul_i32 s3, s1, 0xffff1c18 +; GFX9-NEXT: s_add_u32 s0, s3, s0 +; GFX9-NEXT: s_addc_u32 s1, s2, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: lshr_mad_i64_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mul_hi_u32 s2, s1, 0xffff1c18 +; GFX11-NEXT: s_mul_i32 s3, s1, 0xffff1c18 +; GFX11-NEXT: s_sub_i32 s2, s2, s1 +; GFX11-NEXT: s_add_u32 s0, s3, s0 +; GFX11-NEXT: s_addc_u32 s1, s2, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: lshr_mad_i64_sgpr: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s4, 
0xffff1c18 +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_mov_b32 s5, -1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-NEXT: ; return to shader part epilog + %lsh = lshr i64 %arg0, 32 + %mul = mul i64 %lsh, s0xffffffffffff1c18 + %mad = add i64 %mul, %arg0 + + ret i64 %mad +} + +define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 { +; CI-LABEL: lshr_mad_i64_vec: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, 0xffff1c18 +; CI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1] +; CI-NEXT: s_mov_b32 s4, 0xffff1118 +; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3] +; CI-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 +; CI-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 +; CI-NEXT: v_mov_b32_e32 v0, v4 +; CI-NEXT: v_mov_b32_e32 v2, v6 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: lshr_mad_i64_vec: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0xffff1118 +; SI-NEXT: v_mul_lo_u32 v4, v3, s4 +; SI-NEXT: v_mul_hi_u32 v5, v3, s4 +; SI-NEXT: s_mov_b32 s4, 0xffff1c18 +; SI-NEXT: v_mul_hi_u32 v6, v1, s4 +; SI-NEXT: v_mul_lo_u32 v7, v1, s4 +; SI-NEXT: v_sub_i32_e32 v5, vcc, v5, v3 +; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lshr_mad_i64_vec: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xffff1c18 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1] +; GFX9-NEXT: s_mov_b32 s4, 0xffff1118 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3] +; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3 +; 
GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: lshr_mad_i64_vec: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, v7, v3 +; GFX11-NEXT: v_mov_b32_e32 v2, v6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lshr_mad_i64_vec: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mad_co_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3 +; GFX12-NEXT: v_mov_b32_e32 v2, v6 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %lsh = lshr <2 x i64> %arg0, <i64 32, i64 32> + %mul = mul <2 x i64> %lsh, <i64 s0xffffffffffff1c18, i64 s0xffffffffffff1118> + %mad = add <2 x i64> %mul, %arg0 + + ret <2 x i64> %mad +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone speculatable } diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index 5e46fd6b28d2..fa15a42aef2a 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -1838,11 +1838,11 @@ define <3 x half> 
@v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximumnum_v3f16: @@ -1904,8 +1904,8 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximumnum_v3f16_nnan: @@ -1947,20 +1947,20 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX8-LABEL: v_maximumnum_v4f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; 
GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximumnum_v4f16: @@ -2020,12 +2020,12 @@ define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) { ; GFX8-LABEL: v_maximumnum_v4f16_nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; ; GFX9-LABEL: v_maximumnum_v4f16_nnan: @@ -2067,27 +2067,27 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX8-LABEL: v_maximumnum_v6f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v6, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v7, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; 
GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v2, v2, v5 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximumnum_v6f16: @@ -2159,34 +2159,34 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX8-LABEL: v_maximumnum_v8f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v8, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v5 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: 
v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v5, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximumnum_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 9e0b7daf38de..f5fb85d63b8e 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -1792,11 +1792,11 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_min_f16_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimumnum_v3f16: @@ -1858,8 +1858,8 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: 
v_min_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimumnum_v3f16_nnan: @@ -1901,20 +1901,20 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX8-LABEL: v_minimumnum_v4f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: 
v_min_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimumnum_v4f16: @@ -1974,12 +1974,12 @@ define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) { ; GFX8-LABEL: v_minimumnum_v4f16_nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_min_f16_sdwa v2, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimumnum_v4f16_nnan: @@ -2021,27 +2021,27 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX8-LABEL: v_minimumnum_v6f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v6, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v7, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_min_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 
v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v2, v2, v5 ; GFX8-NEXT: v_min_f16_e32 v1, v1, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v4, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimumnum_v6f16: @@ -2113,34 +2113,34 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX8-LABEL: v_minimumnum_v8f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v8, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 ; GFX8-NEXT: v_min_f16_e32 v1, v1, v5 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v5, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v7, v7 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimumnum_v8f16: diff --git 
a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index 73f3d4c037ad..774a22fb907d 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12 %s define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) { ; GFX11-LABEL: test_minmax_i32: @@ -8,6 +10,16 @@ define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_maxmin_i32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_minmax_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maxmin_i32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c) ret i32 %sminmax @@ -45,6 +57,16 @@ define 
i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_maxmin_i32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_minmax_commuted_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maxmin_i32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) %sminmax = call i32 @llvm.smin.i32(i32 %c, i32 %smax) ret i32 %sminmax @@ -56,6 +78,16 @@ define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_minmax_i32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_maxmin_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minmax_i32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b) %smaxmin = call i32 @llvm.smax.i32(i32 %smin, i32 %c) ret i32 %smaxmin @@ -67,6 +99,16 @@ define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_minmax_i32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_maxmin_commuted_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minmax_i32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b) %smaxmin = call i32 @llvm.smax.i32(i32 %c, i32 %smin) ret i32 %smaxmin @@ -79,6 +121,17 @@ define void @test_smed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) { 
; GFX11-NEXT: v_med3_i32 v2, v2, v3, v4 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_smed3_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_med3_i32 v2, v2, v3, v4 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] %tmp0 = call i32 @llvm.smin.i32(i32 %x, i32 %y) %tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y) %tmp2 = call i32 @llvm.smin.i32(i32 %tmp1, i32 %z) @@ -93,6 +146,16 @@ define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_maxmin_u32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_minmax_u32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maxmin_u32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b) %uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c) ret i32 %uminmax @@ -130,6 +193,16 @@ define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_maxmin_u32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_minmax_commuted_u32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maxmin_u32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b) %uminmax = call i32 @llvm.umin.i32(i32 %c, i32 %umax) ret i32 %uminmax @@ -141,6 +214,16 @@ define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_minmax_u32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_maxmin_u32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minmax_u32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b) %umaxmin = call i32 @llvm.umax.i32(i32 %umin, i32 %c) ret i32 %umaxmin @@ -152,6 +235,16 @@ define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_minmax_u32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_maxmin_commuted_u32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minmax_u32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b) %umaxmin = call i32 @llvm.umax.i32(i32 %c, i32 %umin) ret i32 %umaxmin @@ -164,6 +257,17 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) { ; GFX11-NEXT: v_med3_u32 v2, v2, v3, v4 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_umed3_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_med3_u32 v2, v2, v3, v4 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] %tmp0 = call i32 @llvm.umin.i32(i32 %x, i32 %y) %tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y) %tmp2 = call i32 @llvm.umin.i32(i32 %tmp1, i32 %z) @@ -173,44 +277,88 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 
%y, i32 %z) { } define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) { -; SDAG-LABEL: test_minmax_f32_ieee_true: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; SDAG-NEXT: v_max_f32_e32 v2, v2, v2 -; SDAG-NEXT: v_maxmin_f32 v0, v0, v1, v2 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: test_minmax_f32_ieee_true: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; SDAG-GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; SDAG-GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_minmax_f32_ieee_true: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GISEL-NEXT: v_maxmin_f32 v0, v0, v1, v2 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-LABEL: test_minmax_f32_ieee_true: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GISEL-GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GISEL-GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: test_minmax_f32_ieee_true: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; SDAG-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: test_minmax_f32_ieee_true: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GISEL-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maxnum.f32(float %a, float %b) %minmax = call float @llvm.minnum.f32(float %max, float %c) ret float %minmax } define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg %b, float inreg %c, ptr addrspace(1) inreg %out) { -; SDAG-LABEL: s_test_minmax_f32_ieee_false: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; SDAG-NEXT: s_mov_b32 s5, s4 -; SDAG-NEXT: s_mov_b32 s4, s3 -; SDAG-NEXT: v_maxmin_f32 v0, s0, s1, v0 -; SDAG-NEXT: global_store_b32 v1, v0, s[4:5] -; SDAG-NEXT: s_endpgm +; SDAG-GFX11-LABEL: s_test_minmax_f32_ieee_false: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; SDAG-GFX11-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX11-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX11-NEXT: v_maxmin_f32 v0, s0, s1, v0 +; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; SDAG-GFX11-NEXT: s_endpgm ; -; GISEL-LABEL: s_test_minmax_f32_ieee_false: -; GISEL: ; %bb.0: -; GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; GISEL-NEXT: s_mov_b32 s6, s3 -; GISEL-NEXT: s_mov_b32 s7, s4 -; GISEL-NEXT: v_maxmin_f32 v0, s0, s1, v0 -; GISEL-NEXT: global_store_b32 v1, v0, s[6:7] -; GISEL-NEXT: s_endpgm +; GISEL-GFX11-LABEL: s_test_minmax_f32_ieee_false: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GISEL-GFX11-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX11-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX11-NEXT: v_maxmin_f32 v0, s0, s1, v0 +; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-GFX11-NEXT: s_endpgm +; +; SDAG-GFX12-LABEL: 
s_test_minmax_f32_ieee_false: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; SDAG-GFX12-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX12-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX12-NEXT: v_maxmin_num_f32 v0, s0, s1, v0 +; SDAG-GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; SDAG-GFX12-NEXT: s_endpgm +; +; GISEL-GFX12-LABEL: s_test_minmax_f32_ieee_false: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_max_num_f32 s0, s0, s1 +; GISEL-GFX12-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX12-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-GFX12-NEXT: s_min_num_f32 s0, s0, s2 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX12-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-GFX12-NEXT: s_endpgm %smax = call float @llvm.maxnum.f32(float %a, float %b) %sminmax = call float @llvm.minnum.f32(float %smax, float %c) store float %sminmax, ptr addrspace(1) %out @@ -222,27 +370,56 @@ define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b, ; GFX11: ; %bb.0: ; GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: test_minmax_commuted_f32_ieee_false: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: ; return to shader part epilog %max = call float @llvm.maxnum.f32(float %a, float %b) %minmax = call float @llvm.minnum.f32(float %c, float %max) ret float %minmax } define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) { -; SDAG-LABEL: test_maxmin_f32_ieee_true: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; SDAG-NEXT: v_max_f32_e32 v2, v2, v2 -; SDAG-NEXT: v_minmax_f32 v0, v0, v1, v2 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: test_maxmin_f32_ieee_true: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; 
SDAG-GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; SDAG-GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_maxmin_f32_ieee_true: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GISEL-NEXT: v_minmax_f32 v0, v0, v1, v2 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-LABEL: test_maxmin_f32_ieee_true: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GISEL-GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GISEL-GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: test_maxmin_f32_ieee_true: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; SDAG-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: test_maxmin_f32_ieee_true: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GISEL-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float %a, float %b) %maxmin = call float @llvm.maxnum.f32(float %min, float %c) ret float %maxmin @@ -253,6 +430,11 @@ define amdgpu_ps float 
@test_maxmin_commuted_f32_ieee_false(float %a, float %b, ; GFX11: ; %bb.0: ; GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: test_maxmin_commuted_f32_ieee_false: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: ; return to shader part epilog %min = call float @llvm.minnum.f32(float %a, float %b) %maxmin = call float @llvm.maxnum.f32(float %c, float %min) ret float %maxmin @@ -265,6 +447,17 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) ; GFX11-NEXT: v_med3_f32 v2, v2, v3, v4 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_med3_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] %tmp0 = call float @llvm.minnum.f32(float %x, float %y) %tmp1 = call float @llvm.maxnum.f32(float %x, float %y) %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z) @@ -278,29 +471,54 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: test_minmax_f16_ieee_false: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 +; GFX12-NEXT: ; return to shader part epilog %max = call half @llvm.maxnum.f16(half %a, half %b) %minmax = call half @llvm.minnum.f16(half %max, half %c) ret half %minmax } define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) { -; SDAG-LABEL: s_test_minmax_f16_ieee_false: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; SDAG-NEXT: s_mov_b32 s5, s4 -; 
SDAG-NEXT: s_mov_b32 s4, s3 -; SDAG-NEXT: v_maxmin_f16 v0, s0, s1, v0 -; SDAG-NEXT: global_store_b16 v1, v0, s[4:5] -; SDAG-NEXT: s_endpgm +; SDAG-GFX11-LABEL: s_test_minmax_f16_ieee_false: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; SDAG-GFX11-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX11-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0 +; SDAG-GFX11-NEXT: global_store_b16 v1, v0, s[4:5] +; SDAG-GFX11-NEXT: s_endpgm ; -; GISEL-LABEL: s_test_minmax_f16_ieee_false: -; GISEL: ; %bb.0: -; GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; GISEL-NEXT: s_mov_b32 s6, s3 -; GISEL-NEXT: s_mov_b32 s7, s4 -; GISEL-NEXT: v_maxmin_f16 v0, s0, s1, v0 -; GISEL-NEXT: global_store_b16 v1, v0, s[6:7] -; GISEL-NEXT: s_endpgm +; GISEL-GFX11-LABEL: s_test_minmax_f16_ieee_false: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GISEL-GFX11-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX11-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0 +; GISEL-GFX11-NEXT: global_store_b16 v1, v0, s[6:7] +; GISEL-GFX11-NEXT: s_endpgm +; +; SDAG-GFX12-LABEL: s_test_minmax_f16_ieee_false: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; SDAG-GFX12-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX12-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, s0, s1, v0 +; SDAG-GFX12-NEXT: global_store_b16 v1, v0, s[4:5] +; SDAG-GFX12-NEXT: s_endpgm +; +; GISEL-GFX12-LABEL: s_test_minmax_f16_ieee_false: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_max_num_f16 s0, s0, s1 +; GISEL-GFX12-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX12-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-GFX12-NEXT: s_min_num_f16 s0, s0, s2 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX12-NEXT: global_store_b16 v1, v0, s[6:7] +; GISEL-GFX12-NEXT: s_endpgm %smax = call half @llvm.maxnum.f16(half %a, half %b) %sminmax = call half 
@llvm.minnum.f16(half %smax, half %c) store half %sminmax, ptr addrspace(1) %out @@ -308,23 +526,49 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b } define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { -; SDAG-LABEL: test_minmax_commuted_f16_ieee_true: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; SDAG-NEXT: v_maxmin_f16 v0, v0, v1, v2 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_minmax_commuted_f16_ieee_true: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_max_f16_e32 v0, v0, v0 -; GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GISEL-NEXT: v_max_f16_e32 v2, v2, v2 -; GISEL-NEXT: v_maxmin_f16 v0, v0, v1, v2 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: test_minmax_commuted_f16_ieee_true: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; SDAG-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: test_minmax_commuted_f16_ieee_true: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GISEL-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: test_minmax_commuted_f16_ieee_true: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; 
SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: test_minmax_commuted_f16_ieee_true: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GISEL-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half %a, half %b) %minmax = call half @llvm.minnum.f16(half %c, half %max) ret half %minmax @@ -335,29 +579,60 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: test_maxmin_f16_ieee_false: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2 +; GFX12-NEXT: ; return to shader part epilog %min = call half @llvm.minnum.f16(half %a, half %b) %maxmin = call half @llvm.maxnum.f16(half %min, half %c) ret half %maxmin } define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { -; SDAG-LABEL: test_maxmin_commuted_f16_ieee_true: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; SDAG-NEXT: v_minmax_f16 v0, v0, v1, v2 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_maxmin_commuted_f16_ieee_true: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_max_f16_e32 v0, v0, v0 -; GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GISEL-NEXT: v_max_f16_e32 v2, v2, v2 -; GISEL-NEXT: 
v_minmax_f16 v0, v0, v1, v2 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; SDAG-GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GISEL-GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; SDAG-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GISEL-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half %a, half %b) %maxmin = call half @llvm.maxnum.f16(half %c, half %min) ret half %maxmin @@ -370,6 
+645,17 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 ; GFX11-NEXT: v_med3_f16 v2, v2, v3, v4 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_med3_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_med3_num_f16 v2, v2, v3, v4 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] %tmp0 = call half @llvm.minnum.f16(half %x, half %y) %tmp1 = call half @llvm.maxnum.f16(half %x, half %y) %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z) diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll new file mode 100644 index 000000000000..a9b8663a48de --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 %s -o - | FileCheck %s --check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 %s -o - | FileCheck %s --check-prefixes=GFX908 + +define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { +; GFX942-LABEL: matmul_kernel: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX942-NEXT: s_mov_b32 s3, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX942-NEXT: s_branch .LBB0_2 +; GFX942-NEXT: .LBB0_1: ; %bb2 +; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GFX942-NEXT: s_or_b32 s4, s3, 1 +; GFX942-NEXT: s_ashr_i32 s5, s3, 31 +; 
GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: s_and_b32 s3, s5, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3] +; GFX942-NEXT: s_cbranch_execz .LBB0_4 +; GFX942-NEXT: .LBB0_2: ; %bb +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX942-NEXT: s_cbranch_vccz .LBB0_1 +; GFX942-NEXT: ; %bb.3: +; GFX942-NEXT: ; implicit-def: $sgpr3 +; GFX942-NEXT: .LBB0_4: ; %common.ret +; GFX942-NEXT: s_endpgm +; +; GFX908-LABEL: matmul_kernel: +; GFX908: ; %bb.0: ; %entry +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: s_mov_b32 s2, 0 +; GFX908-NEXT: s_mov_b32 s3, 0 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_cmp_lg_u32 s0, 0 +; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX908-NEXT: s_branch .LBB0_2 +; GFX908-NEXT: .LBB0_1: ; %bb2 +; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GFX908-NEXT: s_or_b32 s4, s3, 1 +; GFX908-NEXT: s_ashr_i32 s5, s3, 31 +; GFX908-NEXT: s_mov_b32 s3, s2 +; GFX908-NEXT: s_nop 3 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_mov_b32_e32 v5, s3 +; GFX908-NEXT: v_mov_b32_e32 v4, s2 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: s_and_b32 s3, s5, s4 +; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[4:5], 
v[4:5], a[0:3] +; GFX908-NEXT: s_cbranch_execz .LBB0_4 +; GFX908-NEXT: .LBB0_2: ; %bb +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX908-NEXT: s_cbranch_vccz .LBB0_1 +; GFX908-NEXT: ; %bb.3: +; GFX908-NEXT: ; implicit-def: $sgpr3 +; GFX908-NEXT: .LBB0_4: ; %common.ret +; GFX908-NEXT: s_endpgm +entry: + br label %bb + +bb: + %i = phi { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } [ %i10, %bb2 ], [ zeroinitializer, %entry ] + %i1 = phi i32 [ %i5, %bb2 ], [ 0, %entry ] + %c0 = icmp ne i32 %a0, 0 + br i1 %c0, label %bb2, label %bb11 + +bb2: + %i3 = or i32 %i1, 1 + %i4 = icmp slt i32 %i1, 0 + %i5 = select i1 %i4, i32 %i3, i32 0 + %i6 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 123 + %i7 = insertelement <4 x float> zeroinitializer, float %i6, i32 0 + %i8 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i7, i32 0, i32 0, i32 0) + %i9 = extractelement <4 x float> %i8, i32 0 + %i10 = insertvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } zeroinitializer, float %i9, 123 + br label %bb + +bb11: + %c1 = icmp ne i32 %a1, 0 + br i1 %c1, label %bb12, label %common.ret + +common.ret: + ret void + +bb12: + %i13 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 0 + %i14 = insertelement <4 x float> zeroinitializer, float %i13, i32 0 + %i15 = insertelement <4 x float> %i14, float 0.000000e+00, i32 0 + %i16 = insertelement <4 x float> %i15, float 0.000000e+00, i32 0 + br label %common.ret +} + +; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir new file mode 100644 index 000000000000..5c83170563e5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir @@ -0,0 +1,235 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE + +... 
+--- +name: test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: S_BITCMP1_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B32_e32_]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec + ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_]], 1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:agpr_32 = PHI [[COPY1]], %bb.0, %24, %bb.3 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %11, %bb.3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[PHI]] + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit $vcc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI1]], 1, implicit-def dead $scc + ; CHECK-NEXT: 
[[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[PHI1]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_ASHR_I32_]], killed [[S_OR_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:areg_128_align2 = COPY [[REG_SEQUENCE]] + ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY4]], [[COPY4]], killed [[COPY5]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, [[S_AND_B32_]], %bb.2 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:agpr_32 = PHI [[COPY3]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_]].sub0, %bb.2 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.1, [[S_MOV_B64_1]], %bb.2 + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI4]], implicit $exec + ; CHECK-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_1]], 1, implicit $exec + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: S_ENDPGM 0 + ; + ; COALESCE-LABEL: name: test + ; COALESCE: bb.0: + ; COALESCE-NEXT: successors: 
%bb.1(0x80000000) + ; COALESCE-NEXT: liveins: $sgpr4_sgpr5 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; COALESCE-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0 + ; COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc + ; COALESCE-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec + ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec + ; COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec + ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.1: + ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 + ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc + ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc + ; COALESCE-NEXT: S_BRANCH %bb.2 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.2: + ; COALESCE-NEXT: successors: %bb.3(0x80000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc + ; COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc + ; COALESCE-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc + ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 + ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 + ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 + ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] + ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]] + ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.3: + ; COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec + ; COALESCE-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec + ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + ; COALESCE-NEXT: S_BRANCH %bb.4 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.4: + ; COALESCE-NEXT: successors: %bb.5(0x80000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.5: + ; COALESCE-NEXT: S_ENDPGM 0 + ; + ; GFX908-COALESCE-LABEL: name: test + ; GFX908-COALESCE: bb.0: + ; GFX908-COALESCE-NEXT: successors: %bb.1(0x80000000) + ; GFX908-COALESCE-NEXT: liveins: $sgpr4_sgpr5 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX908-COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; 
GFX908-COALESCE-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; GFX908-COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0 + ; GFX908-COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX908-COALESCE-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec + ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec + ; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.1: + ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 + ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc + ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc + ; GFX908-COALESCE-NEXT: S_BRANCH %bb.2 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.2: + ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc + ; GFX908-COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc + ; GFX908-COALESCE-NEXT: 
[[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 + ; GFX908-COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] + ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]] + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.3: + ; GFX908-COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec + ; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec + ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + ; GFX908-COALESCE-NEXT: S_BRANCH %bb.4 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.4: + ; GFX908-COALESCE-NEXT: successors: %bb.5(0x80000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.5: + ; GFX908-COALESCE-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1 + liveins: $sgpr4_sgpr5 + + %0:sgpr_64(p4) = COPY $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + S_BITCMP1_B32 killed %1, 0, implicit-def $scc + %2:sgpr_32 = S_MOV_B32 0 + %3:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_32 = IMPLICIT_DEF + %6:vgpr_32 = 
V_CNDMASK_B32_e64 0, 0, 0, 1, %3, implicit $exec + %7:sreg_64_xexec = V_CMP_NE_U32_e64 %6, 1, implicit $exec + + bb.1: + successors: %bb.2, %bb.3 + + %8:vgpr_32 = PHI %4, %bb.0, %9, %bb.3 + %10:sreg_32 = PHI %2, %bb.0, %11, %bb.3 + %12:agpr_32 = COPY %8 + %13:sreg_64 = S_MOV_B64 -1 + $vcc = S_AND_B64 $exec, %7, implicit-def $scc + S_CBRANCH_VCCNZ %bb.3, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + + %14:sreg_32 = S_OR_B32 %10, 1, implicit-def dead $scc + %15:sreg_32 = S_ASHR_I32 %10, 31, implicit-def dead $scc + %16:sreg_32 = S_AND_B32 killed %15, killed %14, implicit-def dead $scc + %17:vreg_128_align2 = REG_SEQUENCE %8, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3 + %18:sreg_64 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1 + %19:vreg_64_align2 = COPY %18 + %20:areg_128_align2 = COPY %17 + %21:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %19, %19, killed %20, 0, 0, 0, implicit $mode, implicit $exec + %22:vgpr_32 = COPY %21.sub0 + %23:sreg_64 = S_MOV_B64 0 + + bb.3: + successors: %bb.4, %bb.1 + + %11:sreg_32 = PHI %5, %bb.1, %16, %bb.2 + %24:agpr_32 = PHI %12, %bb.1, %21.sub0, %bb.2 + %25:sreg_64_xexec = PHI %13, %bb.1, %23, %bb.2 + %9:vgpr_32 = COPY %24 + %26:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %25, implicit $exec + %27:sreg_64_xexec = V_CMP_NE_U32_e64 %26, 1, implicit $exec + $vcc = S_AND_B64 $exec, %27, implicit-def $scc + S_CBRANCH_VCCNZ %bb.1, implicit $vcc + S_BRANCH %bb.4 + + bb.4: + successors: %bb.5 + + bb.5: + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir new file mode 100644 index 000000000000..49c0aaf9fb39 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir @@ -0,0 +1,182 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE + +... +--- +name: test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: S_BITCMP0_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: 
[[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_4:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_4]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_3:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MFMA_F32_16X16X16F16_e64_3]].sub0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:agpr_32 = PHI [[V_ACCVGPR_WRITE_B32_e64_]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_3]].sub0, %bb.2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[PHI]] + ; CHECK-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; 
CHECK-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_PACK_B32_F16_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8) + ; CHECK-NEXT: S_ENDPGM 0 + ; + ; COALESCE-LABEL: name: test + ; COALESCE: bb.0: + ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; COALESCE-NEXT: liveins: $sgpr4_sgpr5 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0 + ; COALESCE-NEXT: S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; COALESCE-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.1: + ; COALESCE-NEXT: successors: %bb.3(0x80000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; COALESCE-NEXT: S_BRANCH %bb.3 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.2: + ; COALESCE-NEXT: successors: %bb.3(0x80000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; COALESCE-NEXT: 
[[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 + ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.3: + ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; COALESCE-NEXT: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec + ; COALESCE-NEXT: undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; COALESCE-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8) + ; COALESCE-NEXT: S_ENDPGM 0 + ; + ; GFX908-COALESCE-LABEL: name: test + ; GFX908-COALESCE: bb.0: + ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), 
%bb.1(0x40000000) + ; GFX908-COALESCE-NEXT: liveins: $sgpr4_sgpr5 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX908-COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; GFX908-COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0 + ; GFX908-COALESCE-NEXT: S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; GFX908-COALESCE-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.1: + ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: S_BRANCH %bb.3 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.2: + ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 + ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: 
[[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.3: + ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-COALESCE-NEXT: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; GFX908-COALESCE-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8) + ; GFX908-COALESCE-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.2, %bb.1 + liveins: $sgpr4_sgpr5 + + %0:sgpr_64(p4) = COPY $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sgpr_32 = S_MOV_B32 0 + S_BITCMP0_B32 killed %1, 0, implicit-def $scc 
+ S_CBRANCH_SCC0 %bb.2, implicit $scc + + bb.1: + successors: %bb.3 + + %3:sgpr_32 = COPY %2 + %4:vgpr_32 = COPY %3, implicit $exec + S_BRANCH %bb.3 + + bb.2: + successors: %bb.3 + + %5:sgpr_32 = S_MOV_B32 0 + %6:vgpr_32 = COPY %5 + %7:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec + %8:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec + %9:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec + %10:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec + %11:areg_128_align2 = REG_SEQUENCE %7, %subreg.sub0, %8, %subreg.sub1, %9, %subreg.sub2, %10, %subreg.sub3 + %12:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %5, %subreg.sub1 + %13:vreg_64_align2 = COPY %12 + %14:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %11, 0, 0, 0, implicit $mode, implicit $exec + %15:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %14, 0, 0, 0, implicit $mode, implicit $exec + %16:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %15, 0, 0, 0, implicit $mode, implicit $exec + %17:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %16, 0, 0, 0, implicit $mode, implicit $exec + %18:vgpr_32 = COPY %17.sub0 + %19:vgpr_32 = COPY %18 + + bb.3: + %20:vgpr_32 = PHI %4, %bb.1, %19, %bb.2 + %21:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, %20, 0, 0, implicit $mode, implicit $exec + %22:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, %2, 0, 0, implicit $mode, implicit $exec + %23:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %24:vreg_64_align2 = REG_SEQUENCE %22, %subreg.sub0, killed %23, %subreg.sub1 + %25:sgpr_128 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1, %2, %subreg.sub2, %2, %subreg.sub3 + %26:vreg_64_align2 = COPY %24 + BUFFER_STORE_DWORDX2_OFFSET_exact killed %26, killed %25, %2, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8) + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 98d5f3097153..a2a0107a6f7d 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -1372,20 +1372,19 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf000 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf800 -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[3:4] -; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[3:4] +; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v4 ; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v7 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v5 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc @@ -1416,32 +1415,32 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff8000, v1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff8000, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s35 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v12 +; GFX9-NEXT: v_add_co_u32_e32 v2, 
vcc, s34, v10 ; GFX9-NEXT: v_mov_b32_e32 v3, 3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-4096 ; GFX9-NEXT: s_movk_i32 s0, 0xf000 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off -; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048 +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[34:35] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: 
Offset64: @@ -1477,8 +1476,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx2 v[10:11], v[0:1], off @@ -1517,25 +1515,25 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo -; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0xfffff000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_load_b64 v[6:7], v[4:5], off offset:-4096 -; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[2:3], off +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:2048 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v2 -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35] ; GFX11-NEXT: s_endpgm entry: @@ -2408,18 +2406,17 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX8-NEXT: v_mov_b32_e32 v3, 3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x800 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v0 ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc ; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GFX8-NEXT: s_endpgm ; @@ -2450,14 +2447,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr 
addrspace(1) nocapture %buf ; GFX9-NEXT: v_mov_b32_e32 v3, 3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:-2048 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc @@ -2490,15 +2486,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo 
+; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo @@ -2525,19 +2520,18 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v3 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3 -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35] ; GFX11-NEXT: s_endpgm entry: diff --git 
a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll index ba428df273db..a439f8df10a2 100644 --- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll +++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll @@ -3,32 +3,17 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_cs float @v_s_exp_f32(float inreg %src) { -; GFX12-SDAG-LABEL: v_s_exp_f32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000 -; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42800000, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) -; GFX12-SDAG-NEXT: s_add_f32 s0, s0, s1 -; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0xffffffc0, 0 -; GFX12-SDAG-NEXT: v_s_exp_f32 s0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GFX12-SDAG-NEXT: v_ldexp_f32 v0, s0, s1 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: v_s_exp_f32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000 -; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42800000, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) -; GFX12-GISEL-NEXT: s_add_f32 s0, s0, s1 -; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0 -; GFX12-GISEL-NEXT: v_s_exp_f32 s0, s0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) -; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: v_s_exp_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000 +; GFX12-NEXT: s_cselect_b32 s1, 0x42800000, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX12-NEXT: s_add_f32 s0, s0, s1 +; GFX12-NEXT: 
s_cselect_b32 s1, 0xffffffc0, 0 +; GFX12-NEXT: v_s_exp_f32 s0, s0 +; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX12-NEXT: v_ldexp_f32 v0, s0, s1 +; GFX12-NEXT: ; return to shader part epilog %result = call float @llvm.exp2.f32(float %src) ret float %result } @@ -88,16 +73,16 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) { ; GFX12-GISEL-LABEL: v_s_log_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000 -; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) -; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1 -; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0 -; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) -; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_lshl_b32 s2, s1, 5 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-GISEL-NEXT: v_ldexp_f32 v0, s0, s2 +; GFX12-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0 +; GFX12-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX12-GISEL-NEXT: v_subrev_f32_e32 v0, s0, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %result = call float @llvm.log2.f32(float %src) ret float %result @@ -322,19 +307,18 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) { ; ; GFX12-GISEL-LABEL: srcmods_abs_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_bitset0_b32 s0, 31 +; GFX12-GISEL-NEXT: s_and_b32 s1, s0, 0x7fffffff ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000 -; GFX12-GISEL-NEXT: s_cselect_b32 s1, 
0x4f800000, 1.0 -; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1 -; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_cmp_lt_f32 s1, 0x800000 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0 +; GFX12-GISEL-NEXT: s_lshl_b32 s2, s1, 5 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-GISEL-NEXT: v_ldexp_f32 v0, |s0|, s2 +; GFX12-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX12-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_subrev_f32_e32 v0, s0, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %abs = call float @llvm.fabs.f32(float %src) %result = call float @llvm.log2.f32(float %abs) @@ -362,19 +346,18 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) { ; ; GFX12-GISEL-LABEL: srcmods_neg_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_xor_b32 s0, s0, 0x80000000 +; GFX12-GISEL-NEXT: s_xor_b32 s1, s0, 0x80000000 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000 -; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0 -; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1 -; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) -; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_cmp_lt_f32 s1, 0x800000 +; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0 +; GFX12-GISEL-NEXT: s_lshl_b32 s2, s1, 5 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 
s1, 0 +; GFX12-GISEL-NEXT: v_ldexp_f32 v0, -s0, s2 +; GFX12-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX12-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_subrev_f32_e32 v0, s0, v0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %neg = fneg float %src %result = call float @llvm.log2.f32(float %neg) diff --git a/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir b/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir new file mode 100644 index 000000000000..3879f6dccf9d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir @@ -0,0 +1,85 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=register-coalescer -o - %s | FileCheck %s + +# This used to assert due to trying to rematerialize V_MOV_B64_PSEUDO +# at copy to $vgpr1. This would assert since this would clobber the +# live value in $vgpr0. 
+ +--- +name: rematerialize_physreg_sub_def_already_live_at_def_assert +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: rematerialize_physreg_sub_def_already_live_at_def_assert + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr1 = COPY [[V_MOV_B]].sub1 + ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit killed $vgpr1 + %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec + %1:vgpr_32 = COPY %0.sub1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = COPY %1 + SI_RETURN implicit $vgpr0, implicit killed $vgpr1 +... + +# Same as previous, except with an IMPLICIT_DEF +--- +name: rematerialize_physreg_sub_def_already_live_at_def_assert_implicit_def +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: rematerialize_physreg_sub_def_already_live_at_def_assert_implicit_def + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr1 = COPY [[DEF]].sub1 + ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit killed $vgpr1 + %0:vreg_64 = IMPLICIT_DEF + %1:vgpr_32 = COPY %0.sub1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr1 = COPY %1 + SI_RETURN implicit $vgpr0, implicit killed $vgpr1 +... + +--- +name: rematerialize_physreg_sub_def_no_live_sub_def_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: rematerialize_physreg_sub_def_no_live_sub_def_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 1, implicit $exec, implicit-def $vgpr1 + ; CHECK-NEXT: SI_RETURN implicit killed $vgpr1 + %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec + %1:vgpr_32 = COPY %0.sub1 + $vgpr1 = COPY %1 + SI_RETURN implicit killed $vgpr1 +... 
+ +--- +name: rematerialize_physreg_sub_def_no_live_sub_def_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: rematerialize_physreg_sub_def_no_live_sub_def_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead $vgpr1_vgpr2 = V_MOV_B64_PSEUDO 1, implicit $exec, implicit-def $vgpr1 + ; CHECK-NEXT: SI_RETURN implicit killed $vgpr1 + %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec + %1:vgpr_32 = COPY %0.sub0 + $vgpr1 = COPY %1 + SI_RETURN implicit killed $vgpr1 +... diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 3e8768c98b5c..96dd6276f7e3 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1065,100 +1065,37 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_48: -; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GCN-IR-NEXT: s_mov_b32 s15, 0 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_sext_i32_i16 s1, s1 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 24 -; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 24 -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 16 -; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 -; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 16 -; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] -; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_sub_u32 s12, s6, s0 -; GCN-IR-NEXT: s_subb_u32 s13, s7, s0 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 -; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[6:7] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], 
s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[12:13] -; GCN-IR-NEXT: s_sub_u32 s16, s14, s20 -; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[16:17], 63 -; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[18:19] -; GCN-IR-NEXT: s_and_b64 s[10:11], s[18:19], exec -; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13 -; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12 -; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s18, s16, 1 -; GCN-IR-NEXT: s_addc_u32 s19, s17, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0 -; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16 -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s18 -; GCN-IR-NEXT: s_add_u32 s18, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 -; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] -; GCN-IR-NEXT: s_add_u32 s12, s8, s20 -; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 -; GCN-IR-NEXT: s_mov_b32 s9, 0 -; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while -; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 -; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s8, s18, s16 -; GCN-IR-NEXT: s_subb_u32 s8, s19, s17 -; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 -; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s8, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s16, s16, s14 -; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 -; GCN-IR-NEXT: 
s_add_u32 s12, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 -; GCN-IR-NEXT: .LBB9_4: ; %Flow4 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] -; GCN-IR-NEXT: .LBB9_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 -; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: s_waitcnt expcnt(0) -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_sext_i32_i16 s1, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: v_alignbit_b32 v0, s1, v0, 24 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_sext_i32_i16 s0, s3 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s2 +; GCN-IR-NEXT: v_alignbit_b32 v2, s0, v2, 24 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v3, v4 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, v3 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; 
GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll index e0d0ddce208c..ddf6297bc27a 100644 --- a/llvm/test/CodeGen/AMDGPU/smed3.ll +++ b/llvm/test/CodeGen/AMDGPU/smed3.ll @@ -1,6 +1,8 @@ ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -98,6 +100,8 @@ declare i64 @llvm.smin.i64(i64, i64) ; VI: v_max_i16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}} ; VI: v_min_i16_e32 {{v[0-9]}}, 17, [[MAX]] ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17 +; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 define amdgpu_kernel void @v_test_smed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -686,6 +690,8 @@ bb: ; VI: v_max_i16 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_smed3_i16_pat_0(ptr 
addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -707,6 +713,8 @@ bb: ; GCN-LABEL: {{^}}v_test_smed3_i16_pat_1: ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_smed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index cb8f82db92bb..23364e860d15 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1188,109 +1188,39 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_48: -; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GCN-IR-NEXT: s_mov_b32 s13, 0 +; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_sext_i32_i16 s1, s1 ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 24 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 24 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], 16 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[0:1], 16 -; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31 -; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[6:7], 16 -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s2, s2, s0 -; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 -; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 -; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s6, s6, s10 -; 
GCN-IR-NEXT: s_subb_u32 s7, s7, s10 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[6:7] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[2:3] -; GCN-IR-NEXT: s_sub_u32 s14, s12, s20 -; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[16:17] -; GCN-IR-NEXT: s_and_b64 s[10:11], s[16:17], exec -; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3 -; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5 -; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s14, 1 -; GCN-IR-NEXT: s_addc_u32 s17, s15, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14 -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 -; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s16 -; GCN-IR-NEXT: s_add_u32 s18, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 -; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] -; GCN-IR-NEXT: s_add_u32 s12, s8, s20 -; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 -; GCN-IR-NEXT: s_mov_b32 s9, 0 -; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while -; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[8:9] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s8, s18, s14 -; GCN-IR-NEXT: s_subb_u32 s8, s19, s15 -; GCN-IR-NEXT: s_ashr_i32 s16, s8, 31 -; 
GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s8, s16, 1 -; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 -; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 -; GCN-IR-NEXT: s_add_u32 s12, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[8:9] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 -; GCN-IR-NEXT: .LBB9_4: ; %Flow4 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GCN-IR-NEXT: .LBB9_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x9 -; GCN-IR-NEXT: s_mul_i32 s4, s6, s11 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GCN-IR-NEXT: s_mul_i32 s4, s7, s10 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GCN-IR-NEXT: s_mul_i32 s4, s6, s10 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s4 -; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v1, s0, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v0, s1, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 -; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 -; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s14, -1 -; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v0, v2, vcc -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_short v0, off, s[12:15], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v1, off, s[12:15], 0 +; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_alignbit_b32 v0, s5, v0, 24 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s2 +; GCN-IR-NEXT: v_alignbit_b32 v2, s3, v2, 24 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v5, v2, v0 +; GCN-IR-NEXT: v_ashrrev_i32_e32 
v5, 30, v5 +; GCN-IR-NEXT: v_or_b32_e32 v5, 1, v5 +; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 +; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 +; GCN-IR-NEXT: v_mad_f32 v3, -v4, v1, v3 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| +; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 diff --git a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll new file mode 100644 index 000000000000..f52f1164f2ba --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +; Reduce a 64-bit sub by a constant if we know the low 32-bits are all +; zero. 
+ +; sub i64:x, K if computeTrailingZeros(K) >= 32 +; => build_pair (sub x.hi, K.hi), x.lo + +define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_0(i64 inreg %reg) { +; GFX9-LABEL: s_sub_i64_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, 0xfffc0000 +; GFX9-NEXT: ; return to shader part epilog + %sub = sub i64 %reg, 1125899906842624 ; (1 << 50) + ret i64 %sub +} + +define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_1(i64 inreg %reg) { +; GFX9-LABEL: s_sub_i64_const_low_bits_known0_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, -1 +; GFX9-NEXT: ; return to shader part epilog + %sub = sub i64 %reg, 4294967296 ; (1 << 32) + ret i64 %sub +} + +define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_2(i64 inreg %reg) { +; GFX9-LABEL: s_sub_i64_const_low_bits_known0_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, -2 +; GFX9-NEXT: ; return to shader part epilog + %sub = sub i64 %reg, 8589934592 ; (1 << 33) + ret i64 %sub +} + +define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_3(i64 inreg %reg) { +; GFX9-LABEL: s_sub_i64_const_low_bits_known0_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, 0x80000000 +; GFX9-NEXT: ; return to shader part epilog + %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63) + ret i64 %sub +} + +define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_4(i64 inreg %reg) { +; GFX9-LABEL: s_sub_i64_const_low_bits_known0_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, 1 +; GFX9-NEXT: ; return to shader part epilog + %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000 + ret i64 %sub +} + +define i64 @v_sub_i64_const_low_bits_known0_0(i64 %reg) { +; GFX9-LABEL: v_sub_i64_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 0xfffc0000, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %reg, 1125899906842624 ; (1 << 50) + ret i64 %sub +} + +define i64 @v_sub_i64_const_low_bits_known0_1(i64 %reg) { +; GFX9-LABEL: 
v_sub_i64_const_low_bits_known0_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %reg, 4294967296 ; (1 << 32) + ret i64 %sub +} + +define i64 @v_sub_i64_const_low_bits_known0_2(i64 %reg) { +; GFX9-LABEL: v_sub_i64_const_low_bits_known0_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, -2, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %reg, 8589934592 ; (1 << 33) + ret i64 %sub +} + +define i64 @v_sub_i64_const_low_bits_known0_3(i64 %reg) { +; GFX9-LABEL: v_sub_i64_const_low_bits_known0_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63) + ret i64 %sub +} + +define i64 @v_sub_i64_const_low_bits_known0_4(i64 %reg) { +; GFX9-LABEL: v_sub_i64_const_low_bits_known0_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000 + ret i64 %sub +} + +define amdgpu_ps i64 @s_sub_i64_const_high_bits_known0_0(i64 inreg %reg) { +; GFX9-LABEL: s_sub_i64_const_high_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 s0, s0, 1 +; GFX9-NEXT: s_addc_u32 s1, s1, -1 +; GFX9-NEXT: ; return to shader part epilog + %sub = sub i64 %reg, 4294967295 ; (1 << 32) - 1 + ret i64 %sub +} + +define i64 @v_sub_i64_const_high_bits_known0_0(i64 %reg) { +; GFX9-LABEL: v_sub_i64_const_high_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %reg, 4294967295 ; (1 << 32) - 1 + ret i64 %sub +} + +define <2 x i64>
@v_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) { +; GFX9-LABEL: v_sub_v2i64_splat_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32) + ret <2 x i64> %sub +} + +define <2 x i64> @v_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) { +; GFX9-LABEL: v_sub_v2i64_nonsplat_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, -2, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33) + ret <2 x i64> %sub +} + +define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) { +; GFX9-LABEL: s_sub_v2i64_splat_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, -1 +; GFX9-NEXT: s_add_i32 s3, s3, -1 +; GFX9-NEXT: ; return to shader part epilog + %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32) + ret <2 x i64> %sub +} + +define amdgpu_ps <2 x i64> @s_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) { +; GFX9-LABEL: s_sub_v2i64_nonsplat_const_low_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_i32 s1, s1, -1 +; GFX9-NEXT: s_add_i32 s3, s3, -2 +; GFX9-NEXT: ; return to shader part epilog + %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33) + ret <2 x i64> %sub +} + +; We could reduce this to use a 32-bit sub if we use computeKnownBits +define i64 @v_sub_i64_variable_high_bits_known0_0(i64 %reg, i32 %offset.hi32) { +; GFX9-LABEL: v_sub_i64_variable_high_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, 0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, 
v1, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %zext.offset.hi32 = zext i32 %offset.hi32 to i64 + %in.high.bits = shl i64 %zext.offset.hi32, 32 + %sub = sub i64 %reg, %in.high.bits + ret i64 %sub +} + +; We could reduce this to use a 32-bit sub if we use computeKnownBits +define amdgpu_ps i64 @s_sub_i64_variable_high_bits_known0_0(i64 inreg %reg, i32 inreg %offset.hi32) { +; GFX9-LABEL: s_sub_i64_variable_high_bits_known0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_u32 s0, s0, 0 +; GFX9-NEXT: s_subb_u32 s1, s1, s2 +; GFX9-NEXT: ; return to shader part epilog + %zext.offset.hi32 = zext i32 %offset.hi32 to i64 + %in.high.bits = shl i64 %zext.offset.hi32, 32 + %sub = sub i64 %reg, %in.high.bits + ret i64 %sub +} diff --git a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir new file mode 100644 index 000000000000..831570800d06 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir @@ -0,0 +1,94 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -stress-regalloc=4 -verify-regalloc -start-before=greedy,2 -stop-after=virtregrewriter,2 -o - %s | FileCheck %s + +# This testcase hit a situation where greedy would hit a use after +# free during last chance recoloring. This case successfully allocates +# after, but is extremely sensitive to the exact allocation ordering. 
+ +--- +name: swdev502267_use_after_free_last_chance_recoloring_alloc_succeeds +tracksRegLiveness: true +stack: + - { id: 0, size: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + ; CHECK-LABEL: name: swdev502267_use_after_free_last_chance_recoloring_alloc_succeeds + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX4 killed renamable $vgpr4_vgpr5, 0, 0, implicit $exec :: (volatile load (s128), addrspace 1) + ; CHECK-NEXT: renamable $vgpr4 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, $vgpr3, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr5 = V_FMA_F32_e64 0, $vgpr7, 0, $vgpr7, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: SI_SPILL_AV128_SAVE $vgpr6_vgpr7_vgpr8_vgpr9, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr6 = V_FMA_F32_e64 0, killed $vgpr8, 0, $vgpr8, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr7 = IMPLICIT_DEF + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF, $vgpr4_vgpr5_vgpr6_vgpr7:0x00000000000000FF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, killed $vgpr2, 0, 0, 
implicit $mode, implicit $exec + ; CHECK-NEXT: SI_SPILL_V128_SAVE $vgpr4_vgpr5_vgpr6_vgpr7, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0 = V_TRUNC_F32_e32 killed $vgpr0, implicit $mode, implicit $exec + ; CHECK-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr5 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr7, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr5 + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr8 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, killed $vgpr6, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $vgpr8_vgpr9 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 
0, killed $vgpr4, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: dead renamable $vgpr1 = V_FMA_F32_e64 0, killed $vgpr5, 0, $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5) + ; CHECK-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 killed renamable $vgpr4_vgpr5, renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128), addrspace 1) + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_RETURN implicit $vgpr0_vgpr1_vgpr2_vgpr3 + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:vreg_64_align2 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, implicit $exec :: (volatile load (s128), addrspace 1) + undef %4.sub0:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub0, 0, %3.sub0, 0, %0.sub3, 0, 0, implicit $mode, implicit $exec + %4.sub1:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub1, 0, %3.sub1, 0, %0.sub2, 0, 0, implicit $mode, implicit $exec + %4.sub2:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub2, 0, %3.sub2, 0, %0.sub1, 0, 0, implicit $mode, implicit $exec + %4.sub3:vreg_128_align2 = IMPLICIT_DEF + S_CBRANCH_EXECZ %bb.2, implicit $exec 
+ + bb.1: + %5:vgpr_32 = V_FMA_F32_e64 0, %4.sub2, 0, %4.sub2, 0, %3.sub2, 0, 0, implicit $mode, implicit $exec + %6:vgpr_32 = V_TRUNC_F32_e32 %5, implicit $mode, implicit $exec + undef %7.sub3:vreg_128_align2 = nofpexcept V_DIV_FIXUP_F32_e64 0, %2, 0, %4.sub3, 0, %3.sub3, 0, 0, implicit $mode, implicit $exec + %7.sub2:vreg_128_align2 = nofpexcept V_FMA_F32_e64 1, %6, 0, %4.sub2, 0, %3.sub2, 0, 0, implicit $mode, implicit $exec + %7.sub0:vreg_128_align2 = nofpexcept V_DIV_FIXUP_F32_e64 0, %2, 0, %4.sub0, 0, %3.sub0, 0, 0, implicit $mode, implicit $exec + %8:vgpr_32 = V_FMA_F32_e64 0, %4.sub1, 0, %4.sub1, 0, %3.sub1, 0, 0, implicit $mode, implicit $exec + %9:vreg_128_align2 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5) + GLOBAL_STORE_DWORDX4 %1, %7, 0, 0, implicit $exec :: (volatile store (s128), addrspace 1) + + bb.2: + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %0 + SI_RETURN implicit $vgpr0_vgpr1_vgpr2_vgpr3 + +... diff --git a/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll new file mode 100644 index 000000000000..f0b3d334af67 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll @@ -0,0 +1,23 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs=0 -O0 2> %t.err < %s | FileCheck %s +; RUN: FileCheck -check-prefix=ERR %s < %t.err + +; FIXME: This error will be fixed by supporting arbitrary divergent +; dynamic allocas by performing a wave umax of the size. 
+ +; ERR: error: <unknown>:0:0: in function move_to_valu_assert_srd_is_physreg_swdev503538 i32 (ptr addrspace(1)): illegal VGPR to SGPR copy + +; CHECK: ; illegal copy v0 to s32 + +define i32 @move_to_valu_assert_srd_is_physreg_swdev503538(ptr addrspace(1) %ptr) { +entry: + %idx = load i32, ptr addrspace(1) %ptr, align 4 + %zero = extractelement <4 x i32> zeroinitializer, i32 %idx + %alloca = alloca [2048 x i8], i32 %zero, align 8, addrspace(5) + %ld = load i32, ptr addrspace(5) %alloca, align 8 + call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 2048, i1 false) + ret i32 %ld +} + +declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #0 + +attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: write) } diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll index 557d023c45f9..4726e81ceb8c 100644 --- a/llvm/test/CodeGen/AMDGPU/umed3.ll +++ b/llvm/test/CodeGen/AMDGPU/umed3.ll @@ -1,6 +1,8 @@ ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -84,6 +86,8 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_i64(ptr addrspace(1) %out, ptr add ; VI: v_max_u16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}} ; VI: v_min_u16_e32 {{v[0-9]}}, 17, [[MAX]] ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; GFX11-TRUE16: 
v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17 +; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 define amdgpu_kernel void @v_test_umed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -707,6 +711,8 @@ bb: ; VI: v_max_u16 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_umed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -728,6 +734,8 @@ bb: ; GCN-LABEL: {{^}}v_test_umed3_i16_pat_1: ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_umed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir index ad4ad6df73e7..b663acb8ce3f 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir @@ -67,7 +67,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_WAITCNT 0 ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: CFI_INSTRUCTION offset $vgpr0_lo16, 16 + ; GCN-NEXT: CFI_INSTRUCTION offset $vgpr0, 16 $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec CFI_INSTRUCTION offset $vgpr0, 16 |
