summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll72
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll110
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll769
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll95
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir129
-rw-r--r--llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll193
-rw-r--r--llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll5208
-rw-r--r--llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir54
-rw-r--r--llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll2610
-rw-r--r--llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-scratch.ll108
-rw-r--r--llvm/test/CodeGen/AMDGPU/fma.f16.ll127
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax3.ll134
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmin3.ll200
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll2100
-rw-r--r--llvm/test/CodeGen/AMDGPU/fract-match.ll428
-rw-r--r--llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-saddr-load.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir204
-rw-r--r--llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll19
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp2.ll1230
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.frexp.ll262
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log.ll1209
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log10.ll1209
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log2.ll1542
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll41
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll41
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad_64_32.ll663
-rw-r--r--llvm/test/CodeGen/AMDGPU/maximumnum.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/minimumnum.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/minmax.ll478
-rw-r--r--llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir235
-rw-r--r--llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir182
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll144
-rw-r--r--llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll97
-rw-r--r--llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir85
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdiv64.ll117
-rw-r--r--llvm/test/CodeGen/AMDGPU/smed3.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem64.ll132
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll193
-rw-r--r--llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir94
-rw-r--r--llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll23
-rw-r--r--llvm/test/CodeGen/AMDGPU/umed3.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir2
51 files changed, 14907 insertions, 6196 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
deleted file mode 100644
index aefcad491073..000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -verify-machineinstrs=0 -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_align4)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align4
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align4 void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1) %ptr) {
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
- %n = load i32, ptr addrspace(1) %gep
- %alloca = alloca i32, i32 %n, align 4, addrspace(5)
- store volatile i32 123, ptr addrspace(5) %alloca
- ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_default_align)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_default_align
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_default_align void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_default_align(ptr addrspace(1) %ptr) {
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
- %n = load i32, ptr addrspace(1) %gep
- %alloca = alloca i32, i32 %n, addrspace(5)
- store volatile i32 123, ptr addrspace(5) %alloca
- ret void
-}
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: kernel_dynamic_stackalloc_vgpr_align64)
-; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align64
-; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align64 void (ptr addrspace(1)): unsupported dynamic alloca
-
-define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align64(ptr addrspace(1) %ptr) {
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
- %n = load i32, ptr addrspace(1) %gep
- %alloca = alloca i32, i32 %n, align 64, addrspace(5)
- store volatile i32 123, ptr addrspace(5) %alloca
- ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) {
- %alloca = alloca i32, i32 %n, align 4, addrspace(5)
- store volatile i32 456, ptr addrspace(5) %alloca
- ret void
-}
-
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_default_align)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_default_align
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_default_align void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_default_align(i32 %n) {
- %alloca = alloca i32, i32 %n, addrspace(5)
- store volatile i32 456, ptr addrspace(5) %alloca
- ret void
-}
-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: func_dynamic_stackalloc_vgpr_align64)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align64
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align64 void (i32): unsupported dynamic alloca
-
-define void @func_dynamic_stackalloc_vgpr_align64(i32 %n) {
- %alloca = alloca i32, i32 %n, align 64, addrspace(5)
- store volatile i32 456, ptr addrspace(5) %alloca
- ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-
-attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index b0f2aac9a42d..7cafa2f608a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -3990,6 +3990,116 @@ bb:
ret void
}
+define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
+; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT: v_add_u32_e32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT: v_add3_u32 v0, s2, v0, -16
+; GFX9-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-NEXT: scratch_store_dword v0, v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_add_u32 s0, s0, s5
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT: scratch_store_dword v0, v1, off offset:-16
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX940: ; %bb.0: ; %bb
+; GFX940-NEXT: v_add_u32_e32 v0, s1, v0
+; GFX940-NEXT: v_add3_u32 v0, s0, v0, -16
+; GFX940-NEXT: v_mov_b32_e32 v1, 15
+; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_endpgm
+;
+; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_endpgm
+;
+; UNALIGNED_GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX9: ; %bb.0: ; %bb
+; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v0, s3, v0
+; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; UNALIGNED_GFX9-NEXT: v_add3_u32 v0, s2, v0, -16
+; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX9-NEXT: scratch_store_dword v0, v1, off
+; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_endpgm
+;
+; UNALIGNED_GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX10: ; %bb.0: ; %bb
+; UNALIGNED_GFX10-NEXT: s_add_u32 s0, s0, s5
+; UNALIGNED_GFX10-NEXT: s_addc_u32 s1, s1, 0
+; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v0, s3, v0
+; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v1, off offset:-16
+; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT: s_endpgm
+;
+; UNALIGNED_GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX940: ; %bb.0: ; %bb
+; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v0, s1, v0
+; UNALIGNED_GFX940-NEXT: v_add3_u32 v0, s0, v0, -16
+; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
+; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT: s_endpgm
+;
+; UNALIGNED_GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX11: ; %bb.0: ; %bb
+; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; UNALIGNED_GFX11-NEXT: scratch_store_b32 v0, v1, off offset:-16 dlc
+; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT: s_endpgm
+;
+; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX12: ; %bb.0: ; %bb
+; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT: s_endpgm
+bb:
+ %add1 = add nsw i32 %sidx, %vidx
+ %add2 = add nsw i32 %add1, -16
+ %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
+ store volatile i32 15, ptr addrspace(5) %gep, align 4
+ ret void
+}
+
define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
; GFX9-LABEL: sgpr_base_negative_offset:
; GFX9: ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 0577117e9d9e..d81faf91801b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -10,10 +10,10 @@ define float @v_pow_f32(float %x, float %y) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -25,19 +25,19 @@ define float @v_pow_f32(float %x, float %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_not_b32_e32 v1, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -49,19 +49,19 @@ define float @v_pow_f32(float %x, float %y) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_not_b32_e32 v1, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -73,17 +73,18 @@ define float @v_pow_f32(float %x, float %y) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_not_b32_e32 v1, 63
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
@@ -91,32 +92,34 @@ define float @v_pow_f32(float %x, float %y) {
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_pow_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%pow = call float @llvm.pow.f32(float %x, float %y)
ret float %pow
@@ -127,111 +130,114 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v4, 0x800000
-; GFX6-NEXT: v_mov_b32_e32 v5, 0x4f800000
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6
-; GFX6-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 5, v4
; GFX6-NEXT: v_log_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v4
; GFX6-NEXT: v_log_f32_e32 v1, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, 0x42000000
-; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
-; GFX6-NEXT: v_sub_f32_e32 v0, v0, v7
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x42000000
+; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
+; GFX6-NEXT: v_sub_f32_e32 v0, v0, v6
+; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5]
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2
; GFX6-NEXT: v_mov_b32_e32 v2, 0xc2fc0000
; GFX6-NEXT: v_sub_f32_e32 v1, v1, v5
-; GFX6-NEXT: v_mov_b32_e32 v7, 0x42800000
+; GFX6-NEXT: v_mov_b32_e32 v6, 0x42800000
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v7, s[4:5]
+; GFX6-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[4:5]
; GFX6-NEXT: v_exp_f32_e32 v0, v0
; GFX6-NEXT: v_add_f32_e32 v1, v1, v2
; GFX6-NEXT: v_exp_f32_e32 v1, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT: v_not_b32_e32 v4, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x4f800000
; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 5, v4
; GFX8-NEXT: v_log_f32_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-NEXT: v_ldexp_f32 v1, v1, v4
; GFX8-NEXT: v_log_f32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v6, 0x42000000
-; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
-; GFX8-NEXT: v_sub_f32_e32 v0, v0, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x42000000
+; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
+; GFX8-NEXT: v_sub_f32_e32 v0, v0, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5]
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 0xc2fc0000
; GFX8-NEXT: v_sub_f32_e32 v1, v1, v5
-; GFX8-NEXT: v_mov_b32_e32 v7, 0x42800000
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x42800000
; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v1, v1, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v7, s[4:5]
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[4:5]
; GFX8-NEXT: v_exp_f32_e32 v0, v0
; GFX8-NEXT: v_add_f32_e32 v1, v1, v2
; GFX8-NEXT: v_exp_f32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x1f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GFX8-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX8-NEXT: v_not_b32_e32 v4, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX8-NEXT: v_ldexp_f32 v1, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x4f800000
; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 5, v4
; GFX9-NEXT: v_log_f32_e32 v0, v0
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-NEXT: v_ldexp_f32 v1, v1, v4
; GFX9-NEXT: v_log_f32_e32 v1, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, 0x42000000
-; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x42000000
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
+; GFX9-NEXT: v_sub_f32_e32 v0, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5]
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v2, 0xc2fc0000
; GFX9-NEXT: v_sub_f32_e32 v1, v1, v5
-; GFX9-NEXT: v_mov_b32_e32 v7, 0x42800000
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x42800000
; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v7, s[4:5]
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[4:5]
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-NEXT: v_exp_f32_e32 v1, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x1f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX9-NEXT: v_not_b32_e32 v4, 63
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX9-NEXT: v_ldexp_f32 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_v2f32:
@@ -239,10 +245,12 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s4
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 5, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v4
+; GFX10-NEXT: v_ldexp_f32 v1, v1, v5
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s4
; GFX10-NEXT: v_log_f32_e32 v0, v0
@@ -257,46 +265,54 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s4
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s4
; GFX10-NEXT: v_exp_f32_e32 v0, v0
; GFX10-NEXT: v_exp_f32_e32 v1, v1
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_pow_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s0
-; GFX11-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo
+; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ldexp_f32 v1, v1, v5
; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: v_log_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v2 :: v_dual_mul_dx9_zero_f32 v1, v1, v3
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_sub_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v4, 5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v4
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo
+; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v1, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s0
-; GFX11-NEXT: v_exp_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
+; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v1, v1
+; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_exp_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y)
ret <2 x float> %pow
@@ -316,9 +332,9 @@ define half @v_pow_f16(half %x, half %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_not_b32_e32 v1, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -388,18 +404,18 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: v_mov_b32_e32 v3, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc
+; GFX6-NEXT: v_not_b32_e32 v3, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc
; GFX6-NEXT: v_exp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc
; GFX6-NEXT: v_add_f32_e32 v1, v1, v2
; GFX6-NEXT: v_exp_f32_e32 v1, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v6
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -508,17 +524,17 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2
; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
; GFX6-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT: v_not_b32_e32 v5, 63
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_add_f32_e32 v0, v0, v2
; GFX6-NEXT: v_exp_f32_e32 v2, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v6
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v1, v6
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v1, v2, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -634,17 +650,17 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT: v_not_b32_e32 v5, 63
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX6-NEXT: v_add_f32_e32 v1, v1, v2
; GFX6-NEXT: v_exp_f32_e32 v0, v0
; GFX6-NEXT: v_exp_f32_e32 v1, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6
-; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v6
+; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -764,17 +780,17 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3
; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
; GFX6-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000
+; GFX6-NEXT: v_not_b32_e32 v5, 63
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
; GFX6-NEXT: v_exp_f32_e32 v2, v2
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v1, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v2, v6
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc
-; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v6
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -885,10 +901,10 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT: v_mul_f32_e64 v0, |v0|, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT: v_ldexp_f32_e64 v0, |v0|, v2
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -900,19 +916,19 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_not_b32_e32 v1, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_f32_fabs_lhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT: v_mul_f32_e64 v0, |v0|, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -924,19 +940,19 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_not_b32_e32 v1, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_f32_fabs_lhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT: v_mul_f32_e64 v0, |v0|, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -948,17 +964,18 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_not_b32_e32 v1, 63
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_f32_fabs_lhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, |v0|
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
-; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
@@ -966,9 +983,9 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_pow_f32_fabs_lhs:
@@ -976,23 +993,24 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
-; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call float @llvm.fabs.f32(float %x)
%pow = call float @llvm.pow.f32(float %fabs.x, float %y)
@@ -1004,10 +1022,10 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1019,19 +1037,19 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_not_b32_e32 v1, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_f32_fabs_rhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1043,19 +1061,19 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_not_b32_e32 v1, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_f32_fabs_rhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1067,17 +1085,18 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_not_b32_e32 v1, 63
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_f32_fabs_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
@@ -1085,32 +1104,34 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_pow_f32_fabs_rhs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1|
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fabs.y = call float @llvm.fabs.f32(float %y)
%pow = call float @llvm.pow.f32(float %x, float %fabs.y)
@@ -1122,10 +1143,10 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT: v_mul_f32_e64 v0, |v0|, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT: v_ldexp_f32_e64 v0, |v0|, v2
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1137,19 +1158,19 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_not_b32_e32 v1, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_f32_fabs_lhs_rhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT: v_mul_f32_e64 v0, |v0|, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1161,19 +1182,19 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_not_b32_e32 v1, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_f32_fabs_lhs_rhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT: v_mul_f32_e64 v0, |v0|, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1185,17 +1206,18 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_not_b32_e32 v1, 63
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_f32_fabs_lhs_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, |v0|
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
-; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
@@ -1203,9 +1225,9 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_pow_f32_fabs_lhs_rhs:
@@ -1213,23 +1235,24 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
-; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ldexp_f32 v0, |v0|, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1|
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call float @llvm.fabs.f32(float %x)
%fabs.y = call float @llvm.fabs.f32(float %y)
@@ -1241,10 +1264,10 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
; GFX6-LABEL: v_pow_f32_sgpr_vgpr:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX6-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1
+; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX6-NEXT: v_ldexp_f32_e32 v1, s0, v1
; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1256,18 +1279,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_not_b32_e32 v1, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_pow_f32_sgpr_vgpr:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX8-NEXT: v_mul_f32_e32 v1, s0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT: v_ldexp_f32 v1, s0, v1
; GFX8-NEXT: v_log_f32_e32 v1, v1
; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1279,18 +1302,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_not_b32_e32 v1, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_pow_f32_sgpr_vgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX9-NEXT: v_mul_f32_e32 v1, s0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX9-NEXT: v_ldexp_f32 v1, s0, v1
; GFX9-NEXT: v_log_f32_e32 v1, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1302,49 +1325,51 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_not_b32_e32 v1, 63
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_pow_f32_sgpr_vgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s1
-; GFX10-NEXT: v_mul_f32_e32 v1, s0, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX10-NEXT: v_ldexp_f32 v1, s0, v1
; GFX10-NEXT: v_log_f32_e32 v1, v1
; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2
; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_pow_f32_sgpr_vgpr:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s1
-; GFX11-NEXT: v_mul_f32_e32 v1, s0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f32 v1, s0, v1
; GFX11-NEXT: v_log_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
%pow = call float @llvm.pow.f32(float %x, float %y)
ret float %pow
@@ -1354,10 +1379,10 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
; GFX6-LABEL: v_pow_f32_vgpr_sgpr:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX6-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1369,18 +1394,18 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_not_b32_e32 v1, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_pow_f32_vgpr_sgpr:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1392,18 +1417,18 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_not_b32_e32 v1, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_pow_f32_vgpr_sgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1415,16 +1440,17 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_not_b32_e32 v1, 63
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_pow_f32_vgpr_sgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1432,31 +1458,33 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) {
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_pow_f32_vgpr_sgpr:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
%pow = call float @llvm.pow.f32(float %x, float %y)
ret float %pow
@@ -1466,10 +1494,10 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
; GFX6-LABEL: v_pow_f32_sgpr_sgpr:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX6-NEXT: v_ldexp_f32_e32 v0, s0, v0
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1481,18 +1509,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_not_b32_e32 v1, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_pow_f32_sgpr_sgpr:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX8-NEXT: v_ldexp_f32 v0, s0, v0
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1504,18 +1532,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_not_b32_e32 v1, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_pow_f32_sgpr_sgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX9-NEXT: v_ldexp_f32 v0, s0, v0
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1527,49 +1555,51 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_not_b32_e32 v1, 63
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_pow_f32_sgpr_sgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s2
-; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX10-NEXT: v_ldexp_f32 v0, s0, v0
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s1, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_pow_f32_sgpr_sgpr:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s2
-; GFX11-NEXT: v_mul_f32_e32 v0, s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f32 v0, s0, v0
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: ; return to shader part epilog
%pow = call float @llvm.pow.f32(float %x, float %y)
ret float %pow
@@ -1580,10 +1610,10 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT: v_mul_f32_e64 v0, -v0, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT: v_ldexp_f32_e64 v0, -v0, v2
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1595,19 +1625,19 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_not_b32_e32 v1, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_f32_fneg_lhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT: v_mul_f32_e64 v0, -v0, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT: v_ldexp_f32 v0, -v0, v2
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1619,19 +1649,19 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_not_b32_e32 v1, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_f32_fneg_lhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT: v_mul_f32_e64 v0, -v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT: v_ldexp_f32 v0, -v0, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1643,17 +1673,18 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_not_b32_e32 v1, 63
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_f32_fneg_lhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, -v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
-; GFX10-NEXT: v_mul_f32_e64 v0, -v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT: v_ldexp_f32 v0, -v0, v2
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
@@ -1661,9 +1692,9 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_pow_f32_fneg_lhs:
@@ -1671,23 +1702,24 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
-; GFX11-NEXT: v_mul_f32_e64 v0, -v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ldexp_f32 v0, -v0, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg float %x
%pow = call float @llvm.pow.f32(float %neg.x, float %y)
@@ -1699,10 +1731,10 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1714,19 +1746,19 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_not_b32_e32 v1, 63
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_f32_fneg_rhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1738,19 +1770,19 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_not_b32_e32 v1, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_f32_fneg_rhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000
; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
@@ -1762,17 +1794,18 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_not_b32_e32 v1, 63
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_f32_fneg_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
@@ -1780,32 +1813,34 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_pow_f32_fneg_rhs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, -v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%neg.y = fneg float %y
%pow = call float @llvm.pow.f32(float %x, float %neg.y)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
index 55015c6d13d8..cdb67caea12c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
@@ -20,8 +20,8 @@ body: |
; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
- ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
- ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
+ ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]]
;
; GFX11-FAKE16-LABEL: name: fcmp_false_f16
; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
@@ -58,8 +58,8 @@ body: |
; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
- ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
- ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
+ ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]]
;
; GFX11-FAKE16-LABEL: name: fcmp_true_f16
; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
index 4241f945a87d..ed811d37c3d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir
@@ -20,8 +20,8 @@ body: |
; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
- ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
- ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]]
+ ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_fake16_e64_]]
;
; GFX11-FAKE16-LABEL: name: fcmp_false_f16
; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
@@ -58,8 +58,8 @@ body: |
; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]]
; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]]
- ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
- ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]]
+ ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_fake16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_fake16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
+ ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_fake16_e64_]]
;
; GFX11-FAKE16-LABEL: name: fcmp_true_f16
; GFX11-FAKE16: liveins: $vgpr0, $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
index eeb7b138fde3..fe002d69faf6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
@@ -18,9 +18,9 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
; GFX7-NEXT: v_exp_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_not_b32_e32 v1, 63
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -75,53 +75,80 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
}
define float @v_powi_f32(float %l, i32 %r) {
-; GFX78-LABEL: v_powi_f32:
-; GFX78: ; %bb.0:
-; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX78-NEXT: v_mov_b32_e32 v3, 0x4f800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GFX78-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX78-NEXT: v_log_f32_e32 v0, v0
-; GFX78-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42000000
-; GFX78-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX78-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX78-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX78-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX78-NEXT: v_exp_f32_e32 v0, v0
-; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX78-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_powi_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x800000
+; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_log_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x42000000
+; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_exp_f32_e32 v0, v0
+; GFX7-NEXT: v_not_b32_e32 v1, 63
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_powi_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX8-NEXT: v_log_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_not_b32_e32 v1, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_powi_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%res = call float @llvm.powi.f32.i32(float %l, i32 %r)
ret float %res
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
index 5378ce2d1efa..10517a49e697 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
@@ -491,3 +491,132 @@ body: |
%1:_(p5) = G_DYN_STACKALLOC %0, 32
S_ENDPGM 0, implicit %1
...
+
+---
+name: test_dyn_stackalloc_vgpr_align4
+legalized: true
+frameInfo:
+ maxAlignment: 4
+stack:
+ - { id: 0, type: variable-sized, alignment: 4 }
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align4
+ ; WAVE64: liveins: $vgpr0
+ ; WAVE64-NEXT: {{ $}}
+ ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+ ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+ ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+ ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+ ;
+ ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align4
+ ; WAVE32: liveins: $vgpr0
+ ; WAVE32-NEXT: {{ $}}
+ ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+ ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+ ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+ ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(p5) = G_DYN_STACKALLOC %0, 4
+ S_ENDPGM 0, implicit %1
+...
+
+---
+name: test_dyn_stackalloc_vgpr_align16
+legalized: true
+frameInfo:
+ maxAlignment: 16
+stack:
+ - { id: 0, type: variable-sized, alignment: 16 }
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align16
+ ; WAVE64: liveins: $vgpr0
+ ; WAVE64-NEXT: {{ $}}
+ ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+ ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+ ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+ ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+ ; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+ ;
+ ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align16
+ ; WAVE32: liveins: $vgpr0
+ ; WAVE32-NEXT: {{ $}}
+ ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+ ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+ ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+ ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+ ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(p5) = G_DYN_STACKALLOC %0, 16
+ S_ENDPGM 0, implicit %1
+...
+
+---
+name: test_dyn_stackalloc_vgpr_align64
+legalized: true
+frameInfo:
+ maxAlignment: 64
+stack:
+ - { id: 0, type: variable-sized, alignment: 64 }
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align64
+ ; WAVE64: liveins: $vgpr0
+ ; WAVE64-NEXT: {{ $}}
+ ; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+ ; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+ ; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+ ; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+ ; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
+ ; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+ ; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096
+ ; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+ ; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+ ; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+ ;
+ ; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align64
+ ; WAVE32: liveins: $vgpr0
+ ; WAVE32-NEXT: {{ $}}
+ ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
+ ; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+ ; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
+ ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
+ ; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047
+ ; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
+ ; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
+ ; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
+ ; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
+ ; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(p5) = G_DYN_STACKALLOC %0, 64
+ S_ENDPGM 0, implicit %1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
new file mode 100644
index 000000000000..52259c4c2e6e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Reduce a 64-bit add by a constant if we know the low 32-bits are all
+; zero.
+
+; add i64:x, K if computeTrailingZeros(K) >= 32
+; => build_pair (add x.hi, K.hi), x.lo
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, 0x40000
+; GFX9-NEXT: ; return to shader part epilog
+ %add = add i64 %reg, 1125899906842624 ; (1 << 50)
+ ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, 1
+; GFX9-NEXT: ; return to shader part epilog
+ %add = add i64 %reg, 4294967296 ; (1 << 32)
+ ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, 2
+; GFX9-NEXT: ; return to shader part epilog
+ %add = add i64 %reg, 8589934592 ; (1 << 33)
+ ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, 0x80000000
+; GFX9-NEXT: ; return to shader part epilog
+ %add = add i64 %reg, -9223372036854775808 ; (1 << 63)
+ ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_4(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, -1
+; GFX9-NEXT: ; return to shader part epilog
+ %add = add i64 %reg, -4294967296 ; 0xffffffff00000000
+ ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 0x40000, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %add = add i64 %reg, 1125899906842624 ; (1 << 50)
+ ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_1(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %add = add i64 %reg, 4294967296 ; (1 << 32)
+ ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_2(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 2, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %add = add i64 %reg, 8589934592 ; (1 << 33)
+ ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_3(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %add = add i64 %reg, -9223372036854775808 ; (1 << 63)
+ ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_4(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %add = add i64 %reg, -4294967296 ; 0xffffffff00000000
+ ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_high_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_high_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 s0, s0, -1
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: ; return to shader part epilog
+ %add = add i64 %reg, 4294967295 ; (1 << 31)
+ ret i64 %add
+}
+
+define i64 @v_add_i64_const_high_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_high_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %add = add i64 %reg, 4294967295 ; (1 << 31)
+ ret i64 %add
+}
+
+define <2 x i64> @v_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_add_v2i64_splat_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+ ret <2 x i64> %add
+}
+
+define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_add_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: v_add_u32_e32 v3, 2, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+ ret <2 x i64> %add
+}
+
+define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, 1
+; GFX9-NEXT: s_add_i32 s3, s3, 1
+; GFX9-NEXT: ; return to shader part epilog
+ %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+ ret <2 x i64> %add
+}
+
+define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, 1
+; GFX9-NEXT: s_add_i32 s3, s3, 2
+; GFX9-NEXT: ; return to shader part epilog
+ %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+ ret <2 x i64> %add
+}
+
+; We could reduce this to use a 32-bit add if we use computeKnownBits
+define i64 @v_add_i64_variable_high_bits_known0_0(i64 %reg, i32 %offset.hi32) {
+; GFX9-LABEL: v_add_i64_variable_high_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+ %in.high.bits = shl i64 %zext.offset.hi32, 32
+ %add = add i64 %reg, %in.high.bits
+ ret i64 %add
+}
+
+; We could reduce this to use a 32-bit add if we use computeKnownBits
+define amdgpu_ps i64 @s_add_i64_variable_high_bits_known0_0(i64 inreg %reg, i32 inreg %offset.hi32) {
+; GFX9-LABEL: s_add_i64_variable_high_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 s0, s0, 0
+; GFX9-NEXT: s_addc_u32 s1, s1, s2
+; GFX9-NEXT: ; return to shader part epilog
+ %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+ %in.high.bits = shl i64 %zext.offset.hi32, 32
+ %add = add i64 %reg, %in.high.bits
+ ret i64 %add
+}
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index 5b72795ba07e..b128be2186df 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX1030 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX7-LABEL: fmul_select_f32_test1:
@@ -21,22 +25,22 @@ define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f32_test1:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test1:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, float 2.000000e+00, float 1.000000e+00
%ldexp = fmul float %x, %y
@@ -60,22 +64,22 @@ define float @fmul_select_f32_test2(float %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f32_test2:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test2:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test2:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, float 5.000000e-01, float 1.000000e+00
%ldexp = fmul float %x, %y
@@ -83,49 +87,71 @@ define float @fmul_select_f32_test2(float %x, i32 %bool.arg1, i32 %bool.arg2) {
}
define <2 x float> @fmul_select_v2f32_test3(<2 x float> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f32_test3:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f32_test3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f32_test3:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f32_test3:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1100-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f32_test3:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f32_test3:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f32_test3:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f32_test3:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_v2f32_test3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f32_test3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
%y = select <2 x i1> %bool, <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
%ldexp = fmul <2 x float> %x, %y
@@ -133,49 +159,71 @@ define <2 x float> @fmul_select_v2f32_test3(<2 x float> %x, <2 x i32> %bool.arg1
}
define <2 x float> @fmul_select_v2f32_test4(<2 x float> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f32_test4:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f32_test4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f32_test4:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f32_test4:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
-; GFX1100-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f32_test4:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f32_test4:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f32_test4:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f32_test4:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_v2f32_test4:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f32_test4:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
%y = select <2 x i1> %bool, <2 x float> <float 5.000000e-01, float 5.000000e-01>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
%ldexp = fmul <2 x float> %x, %y
@@ -199,22 +247,22 @@ define float @fmul_select_f32_test5(float %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f32_test5:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test5:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test5:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test5:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, float -2.000000e+00, float -1.000000e+00
%ldexp = fmul float %x, %y
@@ -222,44 +270,83 @@ define float @fmul_select_f32_test5(float %x, i32 %bool.arg1, i32 %bool.arg2) {
}
define float @fmul_select_f32_test6(float %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f32_test6:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT: v_mov_b32_e32 v4, 0xc0400000
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f32_test6:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x41000000
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xc0400000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f32_test6:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0xc0400000
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test6:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_mov_b32_e32 v3, 0xc0400000
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f32_test6:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc0400000
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f32_test6:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0400000
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0x41000000
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f32_test6:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xc0400000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f32_test6:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc0400000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x41000000
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f32_test6:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xc0400000
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f32_test6:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x41000000
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc0400000, vcc_lo
+; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f32_test6:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xc0400000
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f32_test6:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x41000000
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc0400000, vcc_lo
+; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, float -3.000000e+00, float 8.000000e+00
%ldexp = fmul float %x, %y
@@ -285,22 +372,22 @@ define float @fmul_select_f32_test7_sel_log2val_pos59_pos92(float %x, i32 %bool.
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
-; GFX1030-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, float 0x43A0000000000000, float 0x45B0000000000000
%ldexp = fmul float %x, %y
@@ -308,44 +395,83 @@ define float @fmul_select_f32_test7_sel_log2val_pos59_pos92(float %x, i32 %bool.
}
define float @fmul_select_f32_test8(float %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f32_test8:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, 0xc1000000
-; GFX7-NEXT: v_mov_b32_e32 v4, 0x41800000
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f32_test8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xc1000000
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x41800000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f32_test8:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0x41800000
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test8:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_mov_b32_e32 v3, 0x41800000
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f32_test8:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0xc1000000
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f32_test8:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0xc1000000
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f32_test8:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc1000000
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f32_test8:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xc1000000
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f32_test8:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x41800000
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f32_test8:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1000000
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x41800000, vcc_lo
+; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f32_test8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x41800000
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f32_test8:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1000000
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x41800000, vcc_lo
+; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, float 1.600000e+01, float -8.000000e+00
%ldexp = fmul float %x, %y
@@ -369,22 +495,22 @@ define float @fmul_select_f32_test9(float %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f32_test9:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test9:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test9:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test9:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, float 0.000000e+00, float 2.000000e+00
%ldexp = fmul float %x, %y
@@ -410,22 +536,22 @@ define float @fmul_select_f32_test10(float %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f32_test10:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test10:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test10:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test10:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, float -0.000000e+00, float 0.000000e+00
%ldexp = fmul float %x, %y
@@ -451,22 +577,22 @@ define float @fmul_select_f32_test11_sel_log2val_pos78_pos56(float %x, i32 %bool
; GFX9-NEXT: v_ldexp_f32 v0, -v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
-; GFX1030-NEXT: v_ldexp_f32 v0, -v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f32 v0, -v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
+; GFX10-NEXT: v_ldexp_f32 v0, -v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f32 v0, -v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, float 0xC4D0000000000000, float 0xC370000000000000
%ldexp = fmul float %x, %y
@@ -474,44 +600,83 @@ define float @fmul_select_f32_test11_sel_log2val_pos78_pos56(float %x, i32 %bool
}
define float @fmul_select_f32_test12_sel_log2val_neg48_pos68(float %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, 0x44
-; GFX7-NEXT: v_not_b32_e32 v4, 47
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x44
-; GFX9-NEXT: v_not_b32_e32 v4, 47
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_not_b32_e32 v3, 47
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
-; GFX1030-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_not_b32_e32 v3, 47
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
-; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x44
+; GFX7-SDAG-NEXT: v_not_b32_e32 v4, 47
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_not_b32_e32 v3, 47
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0x44
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x44
+; GFX9-SDAG-NEXT: v_not_b32_e32 v4, 47
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_not_b32_e32 v3, 47
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x44
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_not_b32_e32 v3, 47
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x44
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xffffffd0, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_not_b32_e32 v3, 47
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x44
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xffffffd0, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, float 0x3CF0000000000000, float 0x4430000000000000
%ldexp = fmul float %x, %y
@@ -535,22 +700,22 @@ define double @fmul_select_f64_test1(double %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f64_test1:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test1:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double 2.000000e+00, double 1.000000e+00
%ldexp = fmul double %x, %y
@@ -574,22 +739,22 @@ define double @fmul_select_f64_test2(double %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f64_test2:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test2:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test2:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double 5.000000e-01, double 1.000000e+00
%ldexp = fmul double %x, %y
@@ -619,28 +784,28 @@ define <2 x double> @fmul_select_v2f64_test3(<2 x double> %x, <2 x i32> %bool.ar
; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_v2f64_test3:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1030-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test3:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1100-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_v2f64_test3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f64_test3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
%y = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
%ldexp = fmul <2 x double> %x, %y
@@ -670,28 +835,28 @@ define <2 x double> @fmul_select_v2f64_test4(<2 x double> %x, <2 x i32> %bool.ar
; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_v2f64_test4:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo
-; GFX1030-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test4:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo
-; GFX1100-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_v2f64_test4:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f64_test4:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX11-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
%y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 5.000000e-01>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
%ldexp = fmul <2 x double> %x, %y
@@ -715,22 +880,22 @@ define double @fmul_select_f64_test5(double %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f64_test5:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1030-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test5:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test5:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test5:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double -5.000000e-01, double -1.000000e+00
%ldexp = fmul double %x, %y
@@ -754,22 +919,22 @@ define double @fmul_select_f64_test6(double %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f64_test6:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1030-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test6:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test6:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test6:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double -2.000000e+00, double -1.000000e+00
%ldexp = fmul double %x, %y
@@ -777,44 +942,64 @@ define double @fmul_select_f64_test6(double %x, i32 %bool.arg1, i32 %bool.arg2)
}
define double @fmul_select_f64_test7(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test7:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, 0xbff00000
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, 2.0, vcc
-; GFX7-NEXT: v_mov_b32_e32 v2, 0
-; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test7:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xbff00000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, 2.0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test7:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT: v_mov_b32_e32 v4, 0
-; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
-; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test7:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT: v_mov_b32_e32 v4, 0
-; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test7:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xbff00000
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 2.0, vcc
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test7:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, 0xbff00000
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 2.0, vcc
+; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test7:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xbff00000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, 2.0, vcc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test7:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xbff00000
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 2.0, vcc
+; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_f64_test7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
+; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test7:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double 2.000000e+00, double -1.000000e+00
%ldexp = fmul double %x, %y
@@ -838,22 +1023,22 @@ define double @fmul_select_f64_test8(double %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f64_test8:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc_lo
-; GFX1030-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test8:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 5, 2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double -4.000000e+00, double -3.200000e+01
%ldexp = fmul double %x, %y
@@ -883,28 +1068,28 @@ define <2 x double> @fmul_select_v2f64_test9(<2 x double> %x, <2 x i32> %bool.ar
; GFX9-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_v2f64_test9:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4
-; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1030-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v5
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test9:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4
-; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX1100-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v5
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_v2f64_test9:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_v2f64_test9:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT: v_ldexp_f64 v[2:3], -v[2:3], v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
%y = select <2 x i1> %bool, <2 x double> <double -2.000000e+00, double -2.000000e+00>, <2 x double> <double -1.000000e+00, double -1.000000e+00>
%ldexp = fmul <2 x double> %x, %y
@@ -912,60 +1097,115 @@ define <2 x double> @fmul_select_v2f64_test9(<2 x double> %x, <2 x i32> %bool.ar
}
define <2 x double> @fmul_select_v2f64_test10(<2 x double> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f64_test10:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v8, 0xbff00000
-; GFX7-NEXT: v_mov_b32_e32 v9, 0x3fe00000
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX7-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX7-NEXT: v_mov_b32_e32 v8, 0
-; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX7-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f64_test10:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v8, 0xbff00000
-; GFX9-NEXT: v_mov_b32_e32 v9, 0x3fe00000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX9-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f64_test10:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v8, 0x3fe00000
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1030-NEXT: v_cndmask_b32_e32 v9, 0xbff00000, v8, vcc_lo
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1030-NEXT: v_mov_b32_e32 v8, 0
-; GFX1030-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX1030-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test10:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_mov_b32_e32 v8, 0x3fe00000
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_dual_cndmask_b32 v9, 0xbff00000, v8 :: v_dual_mov_b32 v8, 0
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
-; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
-; GFX1100-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v8, 0xbff00000
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v9, 0x3fe00000
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v8, 0
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX7-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, 0x3fe00000
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, 0
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX7-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v8, 0xbff00000
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v9, 0x3fe00000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX9-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v9, 0x3fe00000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v8, 0x3fe00000
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, 0xbff00000, v8, vcc_lo
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX10-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v9, 0xbff00000
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v9, v9, 0x3fe00000, vcc_lo
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, 0x3fe00000
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, 0xbff00000, v8 :: v_dual_mov_b32 v8, 0
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v9, 0xbff00000 :: v_dual_mov_b32 v8, 0
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v9, v9, 0x3fe00000, vcc_lo
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
%y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 2.000000e+00>, <2 x double> <double -1.000000e+00, double 1.000000e+00>
%ldexp = fmul <2 x double> %x, %y
@@ -973,44 +1213,64 @@ define <2 x double> @fmul_select_v2f64_test10(<2 x double> %x, <2 x i32> %bool.a
}
define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test11:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, -2.0, vcc
-; GFX7-NEXT: v_mov_b32_e32 v2, 0
-; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test11:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, -2.0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test11:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT: v_mov_b32_e32 v4, 0
-; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
-; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test11:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT: v_mov_b32_e32 v4, 0
-; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test11:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, -2.0, vcc
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test11:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -2.0, vcc
+; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test11:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v4, -2.0, vcc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test11:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -2.0, vcc
+; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_select_f64_test11:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
+; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test11:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double -2.000000e+00, double -0.000000e+00
%ldexp = fmul double %x, %y
@@ -1018,45 +1278,84 @@ define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2)
}
define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test12:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 31, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, 0
-; GFX7-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test12:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 31, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test12:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1030-NEXT: v_lshlrev_b32_e32 v3, 31, v3
-; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test12:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 31, v3
-; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test12:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v2
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test12:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX7-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test12:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test12:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f64_test12:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v3, 31, v3
+; GFX10-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f64_test12:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo
+; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f64_test12:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v3
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 31, v3
+; GFX11-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f64_test12:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, 0, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double 0.000000e+00, double -0.000000e+00
%ldexp = fmul double %x, %y
@@ -1084,24 +1383,24 @@ define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f64_test13:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT: v_mov_b32_e32 v4, 0
-; GFX1030-NEXT: v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
-; GFX1030-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test13:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT: v_mov_b32_e32 v4, 0
-; GFX1100-NEXT: v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f64_test13:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
+; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f64_test13:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double 0.000000e+00, double 1.600000e+01
%ldexp = fmul double %x, %y
@@ -1109,44 +1408,83 @@ define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2)
}
define double @fmul_select_f64_test14_sel_log2val_pos92_neg27(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_not_b32_e32 v4, 26
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x5c
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX7-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_not_b32_e32 v4, 26
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x5c
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v4, 0x5c
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
-; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_mov_b32_e32 v4, 0x5c
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
-; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_not_b32_e32 v4, 26
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v5, 0x5c
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX7-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0x5c
+; GFX7-GISEL-NEXT: v_not_b32_e32 v5, 26
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_not_b32_e32 v4, 26
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x5c
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5c
+; GFX9-GISEL-NEXT: v_not_b32_e32 v5, 26
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0x5c
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
+; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_not_b32_e32 v4, 26
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, v4, 0x5c, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0x5c
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
+; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_not_b32_e32 v4, 26
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, v4, 0x5c, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double 0x45B0000000000000, double 0x3E40000000000000
%ldexp = fmul double %x, %y
@@ -1154,44 +1492,83 @@ define double @fmul_select_f64_test14_sel_log2val_pos92_neg27(double %x, i32 %bo
}
define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_not_b32_e32 v4, 32
-; GFX7-NEXT: v_not_b32_e32 v5, 41
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX7-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_not_b32_e32 v4, 32
-; GFX9-NEXT: v_not_b32_e32 v5, 41
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_not_b32_e32 v4, 41
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
-; GFX1030-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_not_b32_e32 v4, 41
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
-; GFX1100-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_not_b32_e32 v4, 32
+; GFX7-SDAG-NEXT: v_not_b32_e32 v5, 41
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX7-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_not_b32_e32 v4, 41
+; GFX7-GISEL-NEXT: v_not_b32_e32 v5, 32
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_not_b32_e32 v4, 32
+; GFX9-SDAG-NEXT: v_not_b32_e32 v5, 41
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_not_b32_e32 v4, 41
+; GFX9-GISEL-NEXT: v_not_b32_e32 v5, 32
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_not_b32_e32 v4, 41
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
+; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_not_b32_e32 v4, 32
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, v4, 0xffffffd6, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_not_b32_e32 v4, 41
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
+; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_not_b32_e32 v4, 32
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, v4, 0xffffffd6, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, double 0x3D50000000000000, double 0x3DE0000000000000
%ldexp = fmul double %x, %y
@@ -1200,40 +1577,82 @@ define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bo
define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test1:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test1:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test1:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test1:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test1:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test1:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test1:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test1:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test1:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test1:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, half 2.000000e+00, half 1.000000e+00
%ldexp = fmul half %x, %y
@@ -1241,47 +1660,89 @@ define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) {
}
define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test2:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX9-NEXT: s_movk_i32 s4, 0x8000
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff
-; GFX9-NEXT: v_med3_i32 v1, v1, s4, v2
-; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test2:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: s_movk_i32 s4, 0x8000
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX1030-NEXT: v_med3_i32 v1, v1, s4, 0x7fff
-; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test2:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_movk_i32 s0, 0x8000
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_med3_i32 v1, v1, s0, 0x7fff
-; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test2:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test2:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test2:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX9-SDAG-NEXT: v_med3_i32 v1, v1, s4, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test2:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test2:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x8000
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-SDAG-NEXT: v_med3_i32 v1, v1, s4, 0x7fff
+; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test2:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test2:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test2:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, half 5.000000e-01, half 1.000000e+00
%ldexp = fmul half %x, %y
@@ -1289,59 +1750,126 @@ define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) {
}
define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f16_test3:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f16_test3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x3c00
-; GFX9-NEXT: v_mov_b32_e32 v6, 0x4000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f16_test3:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v5, 0x4000
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX1030-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f16_test3:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_mov_b32_e32 v5, 0x4000
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3c00
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x4000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT: v_med3_i32 v2, v2, v3, v4
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v1, v0, v1
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v1, v0, v1
+; GFX10-GISEL-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v2
+; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
%y = select <2 x i1> %bool, <2 x half> <half 2.000000e+00, half 2.000000e+00>, <2 x half> <half 1.000000e+00, half 1.000000e+00>
%ldexp = fmul <2 x half> %x, %y
@@ -1349,59 +1877,126 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
}
define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2f16_test4:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2f16_test4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x3c00
-; GFX9-NEXT: v_mov_b32_e32 v6, 0x3800
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f16_test4:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v5, 0x3800
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1030-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX1030-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f16_test4:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_mov_b32_e32 v5, 0x3800
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3c00
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x3800
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT: v_med3_i32 v2, v2, v3, v4
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v1, v0, v1
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x3800
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v1, v0, v1
+; GFX10-GISEL-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x3800
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v2
+; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
%y = select <2 x i1> %bool, <2 x half> <half 5.000000e-01, half 5.000000e-01>, <2 x half> <half 1.000000e+00, half 1.000000e+00>
%ldexp = fmul <2 x half> %x, %y
@@ -1409,15 +2004,25 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
}
define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test5:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test5:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test5:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_f16_test5:
; GFX9: ; %bb.0:
@@ -1427,22 +2032,22 @@ define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f16_test5:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo
-; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test5:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f16_test5:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX10-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f16_test5:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, half 2.000000e+00, half 8.000000e+00
%ldexp = fmul half %x, %y
@@ -1450,46 +2055,88 @@ define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
}
define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test6:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, 0x40400000
-; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1000000
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test6:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x4200
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xc800
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test6:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0xc800
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
-; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test6:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_mov_b32_e32 v3, 0xc800
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
-; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test6:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1000000
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test6:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0xc800
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0x4200
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test6:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x4200
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xc800
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test6:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc800
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4200
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test6:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xc800
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test6:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x4200
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test6:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0xc800
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test6:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4200
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo
+; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, half -8.000000e+00, half 3.000000e+00
%ldexp = fmul half %x, %y
@@ -1497,45 +2144,87 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
}
define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test7:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test7:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xc400
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x4800
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test7:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0x4800
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
-; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test7:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_mov_b32_e32 v3, 0x4800
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
-; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test7:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test7:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x4800
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0xc400
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test7:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc400
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4800
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test7:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x4800
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xc400
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test7:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x4800
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test7:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xc400
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test7:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x4800
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test7:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xc400
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo
+; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, half 8.000000e+00, half -4.000000e+00
%ldexp = fmul half %x, %y
@@ -1543,16 +2232,28 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
}
define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test8:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test8:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test8:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x8000
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_f16_test8:
; GFX9: ; %bb.0:
@@ -1563,22 +2264,22 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1030-LABEL: fmul_select_f16_test8:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
-; GFX1030-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test8:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_f16_test8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_f16_test8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, half -0.000000e+00, half 0.000000e+00
%ldexp = fmul half %x, %y
@@ -1586,40 +2287,87 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
}
define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test9:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test9:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc
-; GFX9-NEXT: v_ldexp_f16_e64 v0, -v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test9:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc_lo
-; GFX1030-NEXT: v_ldexp_f16_e64 v0, -v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test9:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_ldexp_f16_e64 v0, -v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test9:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test9:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, 5, v1
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test9:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc
+; GFX9-SDAG-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test9:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 5, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test9:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc_lo
+; GFX10-SDAG-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test9:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 5, v1
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test9:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test9:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 5, v1
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, half -1.600000e+01, half -3.200000e+01
%ldexp = fmul half %x, %y
@@ -1627,47 +2375,82 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) {
}
define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc
-; GFX9-NEXT: s_movk_i32 s4, 0x8000
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff
-; GFX9-NEXT: v_med3_i32 v1, v1, s4, v2
-; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: s_movk_i32 s4, 0x8000
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo
-; GFX1030-NEXT: v_med3_i32 v1, v1, s4, 0x7fff
-; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_movk_i32 s0, 0x8000
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_med3_i32 v1, v1, s0, 0x7fff
-; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX9-SDAG-NEXT: v_med3_i32 v1, v1, s4, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x8000
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX10-SDAG-NEXT: v_med3_i32 v1, v1, s4, 0x7fff
+; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, half 0xH1000, half 0xH6800
%ldexp = fmul half %x, %y
@@ -1675,47 +2458,82 @@ define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.a
}
define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc
-; GFX9-NEXT: s_movk_i32 s4, 0x8000
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff
-; GFX9-NEXT: v_med3_i32 v1, v1, s4, v2
-; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: s_movk_i32 s4, 0x8000
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo
-; GFX1030-NEXT: v_med3_i32 v1, v1, s4, 0x7fff
-; GFX1030-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_movk_i32 s0, 0x8000
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_med3_i32 v1, v1, s0, 0x7fff
-; GFX1100-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX9-SDAG-NEXT: v_med3_i32 v1, v1, s4, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x8000
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX10-SDAG-NEXT: v_med3_i32 v1, v1, s4, 0x7fff
+; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, half 0xH5800, half 0xH0400
%ldexp = fmul half %x, %y
@@ -1723,72 +2541,114 @@ define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.ar
}
define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test1:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test1:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0x4000
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test1:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x3f80
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x4000
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test1:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, bfloat 2.000000e+00, bfloat 1.000000e+00
%ldexp = fmul bfloat %x, %y
@@ -1796,72 +2656,114 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
}
define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test2:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f00
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test2:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0x3f00
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test2:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x3f80
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x3f00
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x3f00
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test2:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test2:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, bfloat 5.000000e-01, bfloat 1.000000e+00
%ldexp = fmul bfloat %x, %y
@@ -1869,111 +2771,158 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
}
define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2bf16_test3:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2bf16_test3:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80
-; GFX9-NEXT: v_mov_b32_e32 v6, 0x4000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2bf16_test3:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v5, 0x4000
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX1030-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX1030-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1030-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1030-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1030-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1030-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1030-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2bf16_test3:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_mov_b32_e32 v5, 0x4000
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX1100-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1100-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX1100-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1100-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX1100-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1100-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f80
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x4000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-SDAG-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-SDAG-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX10-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX10-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2bf16_test3:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX11-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2bf16_test3:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
%y = select <2 x i1> %bool, <2 x bfloat> <bfloat 2.000000e+00, bfloat 2.000000e+00>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00>
%ldexp = fmul <2 x bfloat> %x, %y
@@ -1981,111 +2930,158 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
}
define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-LABEL: fmul_select_v2bf16_test4:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_v2bf16_test4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80
-; GFX9-NEXT: v_mov_b32_e32 v6, 0x3f00
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2bf16_test4:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v5, 0x3f00
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX1030-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1030-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX1030-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1030-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1030-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1030-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1030-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1030-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2bf16_test4:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_mov_b32_e32 v5, 0x3f00
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX1100-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1100-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX1100-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1100-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX1100-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1100-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f80
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x3f00
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-SDAG-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-SDAG-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-SDAG-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-SDAG-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v1, s4
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f00
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX10-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX10-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX10-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_v2bf16_test4:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f00
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX11-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_v2bf16_test4:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
%y = select <2 x i1> %bool, <2 x bfloat> <bfloat 5.000000e-01, bfloat 5.000000e-01>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00>
%ldexp = fmul <2 x bfloat> %x, %y
@@ -2093,73 +3089,108 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
}
define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test5:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, 2.0, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test5:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x4100
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test5:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0x4000
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
-; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test5:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, 2.0, vcc
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x4100
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x4000
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test5:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test5:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, bfloat 2.000000e+00, bfloat 8.000000e+00
%ldexp = fmul bfloat %x, %y
@@ -2167,74 +3198,116 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
}
define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test6:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, 0x40400000
-; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1000000
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test6:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x4040
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc100
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test6:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0xffffc100
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
-; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test6:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1000000
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0xc100
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0x4040
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x4040
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc100
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc100
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4040
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc100
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x4040
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test6:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test6:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4040
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo
+; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, bfloat -8.000000e+00, bfloat 3.000000e+00
%ldexp = fmul bfloat %x, %y
@@ -2242,73 +3315,115 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
}
define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test7:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test7:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc080
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x4100
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test7:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0x4100
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
-; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test7:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x4100
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0xc080
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc080
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4100
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x4100
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xc080
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x4100
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xc080
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test7:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test7:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xc080
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo
+; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, bfloat 8.000000e+00, bfloat -4.000000e+00
%ldexp = fmul bfloat %x, %y
@@ -2316,73 +3431,111 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
}
define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test8:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, 15
-; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test8:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1030-NEXT: v_lshlrev_b16 v1, 15, v1
-; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test8:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_lshlrev_b16 v1, 15, v1
-; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 31, v1
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x8000
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 15
+; GFX9-SDAG-NEXT: v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x8000
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 15, v1
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b16 v1, 15, v1
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test8:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, bfloat -0.000000e+00, bfloat 0.000000e+00
%ldexp = fmul bfloat %x, %y
@@ -2390,74 +3543,121 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
}
define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test9:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, 0xc2000000
-; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1800000
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test9:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc200
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc180
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test9:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0xffffc180
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
-; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test9:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2000000
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1800000
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, 5, v1
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc200
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc180
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 5, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3
+; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc180
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 5, v1
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test9:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test9:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 5, v1
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, bfloat -1.600000e+01, bfloat -3.200000e+01
%ldexp = fmul bfloat %x, %y
@@ -2465,74 +3665,111 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
}
define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, 0xdb800000
-; GFX7-NEXT: v_bfrev_b32_e32 v4, 7
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffdb80
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffe000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0xffffe000
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
-; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0xdb800000
+; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v4, 7
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x41
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, 56, v3, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffdb80
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffe000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 56, v3, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffe000
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, bfloat 0xRE000, bfloat 0xRDB80
%ldexp = fmul bfloat %x, %y
@@ -2540,74 +3777,111 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
}
define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_bfrev_b32_e32 v3, 50
-; GFX7-NEXT: v_mov_b32_e32 v4, 0x34800000
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x4c00
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x3480
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0x3480
-; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
-; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1030-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1030-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1030-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1030-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
-; GFX1100-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX1100-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX1100-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX1100-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v3, 50
+; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0x34800000
+; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_not_b32_e32 v3, 21
+; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, 25, v3, vcc
+; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x4c00
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x3480
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_not_b32_e32 v3, 21
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 25, v3, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x3480
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%bool = icmp eq i32 %bool.arg1, %bool.arg2
%y = select i1 %bool, bfloat 0xR3480, bfloat 0xR4C00
%ldexp = fmul bfloat %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir
new file mode 100644
index 000000000000..5c7c07632f0d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dummy-regalloc-priority-advisor.mir
@@ -0,0 +1,54 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=default -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=greedy,2 -stress-regalloc=4 -stop-after=virtregrewriter,2 -regalloc-enable-priority-advisor=dummy -o - %s | FileCheck -check-prefixes=CHECK,DUMMY %s
+
+# Check that the regalloc-enable-priority-advisor=dummy option works
+# and the result is different from the default. Ordinarily %1 would be
+# prioritized higher than %0 due to the register class priority
+
+---
+name: foo
+tracksRegLiveness: true
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vreg_128 }
+ - { id: 2, class: vgpr_32 }
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; DEFAULT-LABEL: name: foo
+ ; DEFAULT: liveins: $vgpr0, $vgpr1
+ ; DEFAULT-NEXT: {{ $}}
+ ; DEFAULT-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
+ ; DEFAULT-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; DEFAULT-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+ ; DEFAULT-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+ ; DEFAULT-NEXT: renamable $vgpr3 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; DEFAULT-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec
+ ; DEFAULT-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; DUMMY-LABEL: name: foo
+ ; DUMMY: liveins: $vgpr0, $vgpr1
+ ; DUMMY-NEXT: {{ $}}
+ ; DUMMY-NEXT: SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+ ; DUMMY-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+ ; DUMMY-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+ ; DUMMY-NEXT: renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+ ; DUMMY-NEXT: renamable $vgpr3_vgpr4_vgpr5_vgpr6 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+ ; DUMMY-NEXT: renamable $vgpr3 = V_ADD_U32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
+ ; DUMMY-NEXT: SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1
+ undef %1.sub0:vreg_128 = COPY $vgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
+ %2:vgpr_32 = V_ADD_U32_e32 %1.sub0, %0, implicit $exec
+ $vgpr3 = COPY %2
+ SI_RETURN implicit $vgpr3, implicit $vgpr0, implicit $vgpr1
+
+...
+
+# CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 73aa87e5c55d..9acb3a42ae10 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -1,64 +1,829 @@
-; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=+promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=r600-- -mcpu=cypress < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-GISEL %s
target datalayout = "A5"
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 6
+; GFX9-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-SDAG-NEXT: s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT: s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT: s_endpgm
%alloca = alloca i32, i32 %n, addrspace(5)
store volatile i32 123, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000
+; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 10
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_lshl2_add_u32 s4, s4, 15
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT: s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 10
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT: s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 10
+; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0xfff
+; GFX11-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff000
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-GISEL-NEXT: s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 10
+; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_lshl2_add_u32 s0, s0, 15
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: s_endpgm
%alloca = alloca i32, i32 %n, align 128, addrspace(5)
store volatile i32 10, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 22
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 6
+; GFX9-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 22
+; GFX11-SDAG-NEXT: s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_uniform_under_aligned:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 22
+; GFX11-GISEL-NEXT: s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s0, s32
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1
+; GFX11-GISEL-NEXT: s_endpgm
%alloca = alloca i32, i32 %n, align 2, addrspace(5)
store volatile i32 22, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9-SDAG-NEXT: ; %bb.2:
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX9-GISEL-NEXT: ; %bb.2:
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workitem.id.x()
%alloca = alloca float, i32 %idx, addrspace(5)
store volatile i32 123, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX9-SDAG-NEXT: ; %bb.2:
+; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
+; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX9-GISEL-NEXT: ; %bb.2:
+; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff
+; GFX11-SDAG-NEXT: s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000
+; GFX11-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT: s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workitem.id.x()
%alloca = alloca i32, i32 %idx, align 128, addrspace(5)
store volatile i32 444, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400
+; GFX9-SDAG-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX9-SDAG-NEXT: ; %bb.2:
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 4, 15
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400
+; GFX9-GISEL-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX9-GISEL-NEXT: ; %bb.2:
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s32, 16
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX11-SDAG-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s32, 16
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 4, 15
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workitem.id.x()
%alloca = alloca i128, i32 %idx, align 2, addrspace(5)
store volatile i32 666, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0
+; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2
+; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT: s_add_i32 s32, s9, s5
+; GFX9-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s5, s[6:7]
+; GFX9-SDAG-NEXT: v_readlane_b32 s10, v0, s5
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s5
+; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX9-SDAG-NEXT: ; %bb.3:
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2
+; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15
+; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s32
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT: s_add_i32 s32, s5, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.0
+; GFX9-GISEL-NEXT: s_lshl2_add_u32 s5, s5, 15
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, -16
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s5, 6
+; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0xfff
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xfffff000
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s6
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT: v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX9-GISEL-NEXT: ; %bb.3:
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: s_lshl_b32 s7, s8, 6
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: .LBB6_4: ; %bb.1
+; GFX9-GISEL-NEXT: s_lshl2_add_u32 s4, s4, 15
+; GFX9-GISEL-NEXT: s_mov_b32 s5, s32
+; GFX9-GISEL-NEXT: s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT: s_mov_b32 s2, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT: s_movk_i32 s32, 0x80
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 2
+; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff
+; GFX11-SDAG-NEXT: s_add_i32 s1, s1, 15
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_b32 s4, s1, -16
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: s_and_b32 s1, s3, 0xfffff800
+; GFX11-SDAG-NEXT: s_lshl_b32 s3, s4, 5
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4
+; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s2, s2, s5
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX11-SDAG-NEXT: ; %bb.3:
+; GFX11-SDAG-NEXT: s_mov_b32 s3, s32
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s2, 5, s3
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s3 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT: s_movk_i32 s32, 0x80
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB6_4
+; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.0
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_lshl2_add_u32 s1, s1, 15
+; GFX11-GISEL-NEXT: s_add_u32 s3, s32, 0x7ff
+; GFX11-GISEL-NEXT: s_and_b32 s1, s1, -16
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_lshl_b32 s4, s1, 5
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_and_b32 s1, s3, 0xfffff800
+; GFX11-GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s4
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s5, v0, s4
+; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX11-GISEL-NEXT: ; %bb.3:
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
+; GFX11-GISEL-NEXT: s_mov_b32 s3, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s2, s2, 5
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s3 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_add_u32 s32, s3, s2
+; GFX11-GISEL-NEXT: .LBB6_4: ; %bb.1
+; GFX11-GISEL-NEXT: s_lshl2_add_u32 s0, s0, 15
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-GISEL-NEXT: s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s33 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %n, 0
%alloca1 = alloca i32, i32 8, addrspace(5)
@@ -77,10 +842,206 @@ bb.1:
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
+; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x1000
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB7_6
+; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[6:7]
+; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s8
+; GFX9-SDAG-NEXT: s_max_u32 s4, s4, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX9-SDAG-NEXT: ; %bb.3:
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s4, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5
+; GFX9-SDAG-NEXT: .LBB7_4: ; %bb.0
+; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2
+; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0xfff
+; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15
+; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xfffff000
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16
+; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-SDAG-NEXT: s_add_i32 s32, s4, s5
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: .LBB7_5: ; %bb.2
+; GFX9-SDAG-NEXT: s_endpgm
+; GFX9-SDAG-NEXT: .LBB7_6:
+; GFX9-SDAG-NEXT: s_branch .LBB7_4
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s4, 1
+; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x1000
+; GFX9-GISEL-NEXT: s_cbranch_scc0 .LBB7_4
+; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.1
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[6:7]
+; GFX9-GISEL-NEXT: v_readlane_b32 s9, v0, s4
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s4
+; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s9
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX9-GISEL-NEXT: ; %bb.3:
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s8, 6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s6
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX9-GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX9-GISEL-NEXT: s_xor_b32 s4, s4, 1
+; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX9-GISEL-NEXT: ; %bb.5: ; %bb.0
+; GFX9-GISEL-NEXT: s_lshl2_add_u32 s4, s5, 15
+; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0xfff
+; GFX9-GISEL-NEXT: s_and_b32 s4, s4, -16
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xfffff000
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: .LBB7_6: ; %bb.2
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s32, 64
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc0 .LBB7_6
+; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.1
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s4
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX11-SDAG-NEXT: ; %bb.3:
+; GFX11-SDAG-NEXT: s_mov_b32 s2, s32
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s2
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5
+; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s1, 2
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0x7ff
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
+; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff800
+; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0
+; GFX11-SDAG-NEXT: .LBB7_5: ; %bb.2
+; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-NEXT: .LBB7_6:
+; GFX11-SDAG-NEXT: s_branch .LBB7_4
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s32, 64
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 1
+; GFX11-GISEL-NEXT: s_cbranch_scc0 .LBB7_4
+; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.1
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s3, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT: s_bitset0_b32 s0, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s4
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX11-GISEL-NEXT: ; %bb.3:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s2, 5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_add_u32 s32, s3, s0
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s3 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX11-GISEL-NEXT: s_xor_b32 s0, s0, 1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_b32 s0, s0, 1
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX11-GISEL-NEXT: ; %bb.5: ; %bb.0
+; GFX11-GISEL-NEXT: s_lshl2_add_u32 s0, s1, 15
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0x7ff
+; GFX11-GISEL-NEXT: s_and_b32 s0, s0, -16
+; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff800
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: .LBB7_6: ; %bb.2
+; GFX11-GISEL-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %n, 0
br i1 %cond, label %bb.0, label %bb.1
@@ -97,62 +1058,1113 @@ bb.2:
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9-SDAG-NEXT: ; %bb.2:
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX9-GISEL-NEXT: ; %bb.2:
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca i32, i32 %n, addrspace(5)
store volatile i32 123, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000
+; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX9-SDAG-NEXT: ; %bb.2:
+; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10
+; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000
+; GFX9-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX9-GISEL-NEXT: ; %bb.2:
+; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 10
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100
+; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0xfff
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 10
+; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff000
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100
+; GFX11-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 10
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca i32, i32 %n, align 128, addrspace(5)
store volatile i32 10, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX9-SDAG-NEXT: ; %bb.2:
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX9-GISEL-NEXT: ; %bb.2:
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 22
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 22
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB10_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 22
+; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca i32, i32 %n, align 2, addrspace(5)
store volatile i32 22, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define void @test_dynamic_stackalloc_device_divergent() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX9-SDAG-NEXT: ; %bb.2:
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX9-GISEL-NEXT: ; %bb.2:
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB11_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%idx = call i32 @llvm.amdgcn.workitem.id.x()
%alloca = alloca i32, i32 %idx, addrspace(5)
store volatile i32 123, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s33
+; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s7, 0
+; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8
+; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX9-SDAG-NEXT: ; %bb.2:
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
+; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s10
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000
+; GFX9-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX9-GISEL-NEXT: ; %bb.2:
+; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0x1fff
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s6, 6
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffffe000
+; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s33
+; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100
+; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff
+; GFX11-SDAG-NEXT: s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000
+; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB12_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0xfff
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%idx = call i32 @llvm.amdgcn.workitem.id.x()
%alloca = alloca i32, i32 %idx, align 128, addrspace(5)
store volatile i32 444, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX9-SDAG-NEXT: ; %bb.2:
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX9-GISEL-NEXT: ; %bb.2:
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB13_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%idx = call i32 @llvm.amdgcn.workitem.id.x()
%alloca = alloca i32, i32 %idx, align 2, addrspace(5)
store volatile i32 666, ptr addrspace(5) %alloca
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s13, s33
+; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT: s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x3000
+; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6
+; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
+; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s10, 0
+; GFX9-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT: v_readlane_b32 s11, v1, s9
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s11
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_2
+; GFX9-SDAG-NEXT: ; %bb.3:
+; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1
+; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s10, 0
+; GFX9-SDAG-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s11, s[6:7]
+; GFX9-SDAG-NEXT: v_readlane_b32 s12, v1, s11
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s11
+; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s12
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_4
+; GFX9-SDAG-NEXT: ; %bb.5:
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-SDAG-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s6
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: .LBB14_6: ; %bb.1
+; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s7, v0, s6
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s6
+; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s7
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_7
+; GFX9-SDAG-NEXT: ; %bb.8:
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xd000
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s13
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_mov_b32 s13, s33
+; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT: s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x3000
+; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-GISEL-NEXT: s_cbranch_execz .LBB14_6
+; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.0
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v31
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s9, 0
+; GFX9-GISEL-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7]
+; GFX9-GISEL-NEXT: v_readlane_b32 s11, v1, s10
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s10
+; GFX9-GISEL-NEXT: s_max_u32 s9, s9, s11
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB14_2
+; GFX9-GISEL-NEXT: ; %bb.3:
+; GFX9-GISEL-NEXT: s_add_u32 s7, s32, 0xfff
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s9, 6
+; GFX9-GISEL-NEXT: s_and_b32 s9, s7, 0xfffff000
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v1, v2, 2, 15
+; GFX9-GISEL-NEXT: s_add_u32 s32, s9, s6
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, -16, v1
+; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s10, 0
+; GFX9-GISEL-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s11, s[6:7]
+; GFX9-GISEL-NEXT: v_readlane_b32 s12, v1, s11
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s11
+; GFX9-GISEL-NEXT: s_max_u32 s10, s10, s12
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB14_4
+; GFX9-GISEL-NEXT: ; %bb.5:
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-GISEL-NEXT: s_lshl_b32 s7, s10, 6
+; GFX9-GISEL-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: .LBB14_6: ; %bb.1
+; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s6, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s7, v0, s6
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s6
+; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s7
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB14_7
+; GFX9-GISEL-NEXT: ; %bb.8:
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s8, 6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xd000
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s13
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s33
+; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xc0
+; GFX11-SDAG-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6
+; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4
+; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2
+; GFX11-SDAG-NEXT: ; %bb.3:
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31
+; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff
+; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s3, 5, s2
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1
+; GFX11-SDAG-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s5, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5
+; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_4
+; GFX11-SDAG-NEXT: ; %bb.5:
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s32
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s3, 5, s4
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s2 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s4 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT: .LBB14_6: ; %bb.1
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v0, 2, 15
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s3, v1, s2
+; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7
+; GFX11-SDAG-NEXT: ; %bb.8:
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 1
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s0, 5, s1
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s7
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff40
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_mov_b32 s7, s33
+; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xc0
+; GFX11-GISEL-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-GISEL-NEXT: s_cbranch_execz .LBB14_6
+; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.0
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v2, v1, 2, 15
+; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v31
+; GFX11-GISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v2, -16, v2
+; GFX11-GISEL-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s5, v2, s4
+; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_2
+; GFX11-GISEL-NEXT: ; %bb.3:
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-GISEL-NEXT: s_lshl_b32 s5, s2, 5
+; GFX11-GISEL-NEXT: s_add_u32 s2, s32, 0x7ff
+; GFX11-GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xfffff800
+; GFX11-GISEL-NEXT: v_and_b32_e32 v1, -16, v1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX11-GISEL-NEXT: s_add_u32 s32, s2, s5
+; GFX11-GISEL-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s5, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s6, v1, s5
+; GFX11-GISEL-NEXT: s_bitset0_b32 s4, s5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s3, s3, s6
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_4
+; GFX11-GISEL-NEXT: ; %bb.5:
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s3, s3, 5
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s2 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s4 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_add_u32 s32, s4, s3
+; GFX11-GISEL-NEXT: .LBB14_6: ; %bb.1
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_7
+; GFX11-GISEL-NEXT: ; %bb.8:
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s33 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s7
+; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff40
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%cond = icmp eq i32 %n, 0
%alloca1 = alloca i32, i32 8, addrspace(5)
@@ -171,10 +2183,272 @@ bb.1:
ret void
}
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
-
define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s33
+; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT: s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x2000
+; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_4
+; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
+; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT: v_readlane_b32 s10, v1, s9
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_2
+; GFX9-SDAG-NEXT: ; %bb.3:
+; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff
+; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0xfffff000
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s8, 6, v1
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: ; implicit-def: $vgpr31
+; GFX9-SDAG-NEXT: .LBB15_4: ; %Flow
+; GFX9-SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_8
+; GFX9-SDAG-NEXT: ; %bb.5: ; %bb.0
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s8, 0
+; GFX9-SDAG-NEXT: .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7]
+; GFX9-SDAG-NEXT: v_readlane_b32 s10, v0, s9
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9
+; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_6
+; GFX9-SDAG-NEXT: ; %bb.7:
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2
+; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xe000
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s11
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_mov_b32 s11, s33
+; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT: s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x2000
+; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-GISEL-NEXT: s_cbranch_execz .LBB15_4
+; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.1
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v1, 2, 15
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT: v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB15_2
+; GFX9-GISEL-NEXT: ; %bb.3:
+; GFX9-GISEL-NEXT: s_add_u32 s7, s32, 0xfff
+; GFX9-GISEL-NEXT: s_and_b32 s7, s7, 0xfffff000
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s8, 6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-GISEL-NEXT: s_add_u32 s32, s7, s6
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: ; implicit-def: $vgpr31
+; GFX9-GISEL-NEXT: .LBB15_4: ; %Flow
+; GFX9-GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-GISEL-NEXT: s_cbranch_execz .LBB15_8
+; GFX9-GISEL-NEXT: ; %bb.5: ; %bb.0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s8, 0
+; GFX9-GISEL-NEXT: .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
+; GFX9-GISEL-NEXT: v_readlane_b32 s10, v0, s9
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s9
+; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s10
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB15_6
+; GFX9-GISEL-NEXT: ; %bb.7:
+; GFX9-GISEL-NEXT: s_mov_b32 s6, s32
+; GFX9-GISEL-NEXT: s_lshl_b32 s7, s8, 6
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s7
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: .LBB15_8: ; %bb.2
+; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xe000
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s11
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s33
+; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT: s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x80
+; GFX11-SDAG-NEXT: v_cmpx_ne_u32_e32 0, v0
+; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_4
+; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.1
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1
+; GFX11-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s4, v1, s3
+; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2
+; GFX11-SDAG-NEXT: ; %bb.3:
+; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff
+; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s1, 5, s2
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT: .LBB15_4: ; %Flow
+; GFX11-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_8
+; GFX11-SDAG-NEXT: ; %bb.5: ; %bb.0
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
+; GFX11-SDAG-NEXT: .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
+; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_6
+; GFX11-SDAG-NEXT: ; %bb.7:
+; GFX11-SDAG-NEXT: s_mov_b32 s2, s32
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s2
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff80
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s5
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s33
+; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT: s_mov_b32 s1, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x80
+; GFX11-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v0
+; GFX11-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-GISEL-NEXT: s_cbranch_execz .LBB15_4
+; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.1
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v1, 2, 15
+; GFX11-GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_2
+; GFX11-GISEL-NEXT: ; %bb.3:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-GISEL-NEXT: s_add_u32 s2, s32, 0x7ff
+; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xfffff800
+; GFX11-GISEL-NEXT: ; implicit-def: $vgpr31
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_add_u32 s32, s2, s1
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s2 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: .LBB15_4: ; %Flow
+; GFX11-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-GISEL-NEXT: s_cbranch_execz .LBB15_8
+; GFX11-GISEL-NEXT: ; %bb.5: ; %bb.0
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB15_6: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s3, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3
+; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_6
+; GFX11-GISEL-NEXT: ; %bb.7:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-GISEL-NEXT: s_mov_b32 s2, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_add_u32 s32, s2, s1
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s2 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: .LBB15_8: ; %bb.2
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff80
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s5
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%cond = icmp eq i32 %n, 0
br i1 %cond, label %bb.0, label %bb.1
@@ -190,3 +2464,257 @@ bb.1:
bb.2:
ret void
}
+
+define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB16_1
+; GFX9-SDAG-NEXT: ; %bb.2:
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB16_1
+; GFX9-GISEL-NEXT: ; %bb.2:
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0
+; GFX11-SDAG-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB16_1
+; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB16_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %alloca = alloca i32, i16 %n, align 2, addrspace(5)
+ store volatile i32 666, ptr addrspace(5) %alloca
+ ret void
+}
+
+define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 %n) {
+; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s33
+; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400
+; GFX9-SDAG-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8
+; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX9-SDAG-NEXT: ; %bb.2:
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s32
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400
+; GFX9-GISEL-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7
+; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX9-GISEL-NEXT: ; %bb.2:
+; GFX9-GISEL-NEXT: s_mov_b32 s4, s32
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6
+; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s32
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16
+; GFX11-SDAG-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
+; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX11-SDAG-NEXT: ; %bb.2:
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
+; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s32
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16
+; GFX11-GISEL-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
+; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB17_1
+; GFX11-GISEL-NEXT: ; %bb.2:
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
+; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
+; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %alloca = alloca i32, i64 %n, align 2, addrspace(5)
+ store volatile i32 666, ptr addrspace(5) %alloca
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index ebfb5e9ccaa3..a324ba35b155 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -1625,14 +1625,12 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
; CODEGEN-IEEE-GISEL: ; %bb.0:
; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x4b800000
; CODEGEN-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc
+; CODEGEN-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; CODEGEN-IEEE-GISEL-NEXT: v_rsq_f32_e32 v0, v0
-; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x45800000
-; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc
+; CODEGEN-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 97d642b991f7..5415af02ef89 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -5249,6 +5249,114 @@ bb:
ret void
}
+define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
+; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT: s_add_i32 s2, s2, s3
+; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_add_u32_e32 v0, -16, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-NEXT: scratch_store_dword v0, v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_add_u32 s0, s0, s5
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT: v_add3_u32 v0, s2, s3, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-NEXT: scratch_store_dword v0, v1, off offset:-16
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: v_add3_u32 v0, s0, s1, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, 15
+; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v1, 15
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s8
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: s_add_i32 s0, s0, s1
+; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-PAL-NEXT: v_add_u32_e32 v0, -16, v0
+; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX940: ; %bb.0: ; %bb
+; GFX940-NEXT: s_add_i32 s0, s0, s1
+; GFX940-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX940-NEXT: v_add_u32_e32 v0, -16, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, 15
+; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s8
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s5
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: v_add3_u32 v0, s0, s1, v0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:-16
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_endpgm
+;
+; GFX11-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11-PAL: ; %bb.0: ; %bb
+; GFX11-PAL-NEXT: v_add3_u32 v0, s0, s1, v0
+; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-PAL-NEXT: s_endpgm
+;
+; GFX12-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12-PAL: ; %bb.0: ; %bb
+; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX12-PAL-NEXT: s_add_co_i32 s0, s0, s1
+; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
+; GFX12-PAL-NEXT: s_wait_storecnt 0x0
+; GFX12-PAL-NEXT: s_endpgm
+bb:
+ %add1 = add nsw i32 %sidx, %vidx
+ %add2 = add nsw i32 %add1, -16
+ %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
+ store volatile i32 15, ptr addrspace(5) %gep, align 4
+ ret void
+}
+
define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
; GFX9-LABEL: sgpr_base_negative_offset:
; GFX9: ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 005e40159f61..822d40f7349b 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -5,6 +5,8 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
declare half @llvm.fma.f16(half, half, half)
declare half @llvm.maxnum.f16(half, half)
@@ -27,6 +29,16 @@ define half @test_fma(half %x, half %y, half %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fma:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %x, half %y, half %z)
ret half %r
}
@@ -50,6 +62,16 @@ define half @test_fmac(half %x, half %y, half %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fmac_f16_e32 v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fmac:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fmac_f16_e32 v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %y, half %z, half %x)
ret half %r
}
@@ -81,6 +103,16 @@ define half @test_fmaak(half %x, half %y, half %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fmaak:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200)
ret half %r
}
@@ -112,6 +144,16 @@ define half @test_fmamk(half %x, half %y, half %z) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_fmamk:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %x, half 0xH4200, half %z)
ret half %r
}
@@ -193,6 +235,42 @@ define i32 @test_D139469_f16(half %arg) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: test_D139469_f16:
+; GFX12-SDAG: ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e
+; GFX12-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
+; GFX12-SDAG-NEXT: v_min_num_f16_e32 v0, v2, v1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_D139469_f16:
+; GFX12-GISEL: ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e
+; GFX12-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
+; GFX12-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
+; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
bb:
%i = fmul contract half %arg, 0xH291E
%i1 = fcmp olt half %i, 0xH0000
@@ -306,6 +384,55 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: test_D139469_v2f16:
+; GFX12-SDAG: ; %bb.0: ; %bb
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x211e
+; GFX12-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_pk_min_num_f16 v0, v1, v0
+; GFX12-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_D139469_v2f16:
+; GFX12-GISEL: ; %bb.0: ; %bb
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e
+; GFX12-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
+; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
+; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
+; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_or_b32 s0, s1, s2
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
bb:
%i = fmul contract <2 x half> %arg, <half 0xH291E, half 0xH291E>
%i1 = fcmp olt <2 x half> %i, <half 0xH0000, half 0xH0000>
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 4b3f0dbbaea9..fbcdbed338e6 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
; SI-LABEL: test_fmax3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
; GFX11-NEXT: v_pk_max_f16 v0, v2, v0
; GFX11-NEXT: v_pk_max_f16 v0, v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmax3_v2f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 38b712e044df..269fd52df5c4 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
; SI-LABEL: test_fmin3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
; GFX11-NEXT: v_pk_min_f16 v0, v2, v0
; GFX11-NEXT: v_pk_min_f16 v0, v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmin3_v2f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_min_num_f16 v0, v2, v0
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
%min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
@@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s6
+; GFX12-NEXT: s_mov_b32 s13, s7
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile double, ptr addrspace(1) %aptr, align 4
%b = load volatile double, ptr addrspace(1) %bptr, align 4
%c = load volatile double, ptr addrspace(1) %cptr, align 4
@@ -877,6 +1044,39 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s6
+; GFX12-NEXT: s_mov_b32 s13, s7
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1]
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile double, ptr addrspace(1) %aptr, align 4
%b = load volatile double, ptr addrspace(1) %bptr, align 4
%c = load volatile double, ptr addrspace(1) %cptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 104e157e9e15..9ae60f99d5e0 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -3307,489 +3307,459 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) {
; --------------------------------------------------------------------
define float @v_mul_f32_select_64_1(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_64_1:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f32_select_64_1:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f32_select_64_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: v_mul_f32_select_64_1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_64_1:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
+ %cond = icmp eq i32 %arg, 0
+ %select.pow2 = select i1 %cond, float 64.0, float 1.0
+ %mul = fmul float %x, %select.pow2
+ ret float %mul
+}
+
+define float @v_mul_f32_select_1_64(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_1_64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: v_mul_f32_select_64_1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_1_64:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
+ %cond = icmp eq i32 %arg, 0
+ %select.pow2 = select i1 %cond, float 1.0, float 64.0
+ %mul = fmul float %x, %select.pow2
+ ret float %mul
+}
+
+define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_n1_n64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: v_mul_f32_select_64_1:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_n1_n64:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
+ %cond = icmp eq i32 %arg, 0
+ %select.pow2 = select i1 %cond, float -1.0, float -64.0
+ %mul = fmul float %x, %select.pow2
+ ret float %mul
+}
+
+define float @v_mul_f32_select_n64_n1(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_n64_n1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f32_select_64_1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f32_select_n64_n1:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
- %select.pow2 = select i1 %cond, float 64.0, float 1.0
+ %select.pow2 = select i1 %cond, float -64.0, float -1.0
%mul = fmul float %x, %select.pow2
ret float %mul
}
-define float @v_mul_f32_select_1_64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_1_64:
+define float @v_mul_f32_select_128_64(i32 %arg, float %x) {
+; GFX9-SDAG-LABEL: v_mul_f32_select_128_64:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc
; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-LABEL: v_mul_f32_select_1_64:
+; GFX9-GISEL-LABEL: v_mul_f32_select_128_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 1.0, vcc
-; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: v_mul_f32_select_1_64:
+; GFX10-SDAG-LABEL: v_mul_f32_select_128_64:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo
; GFX10-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: v_mul_f32_select_1_64:
+; GFX10-GISEL-LABEL: v_mul_f32_select_128_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: v_mul_f32_select_1_64:
+; GFX11-SDAG-LABEL: v_mul_f32_select_128_64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo
; GFX11-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f32_select_1_64:
+; GFX11-GISEL-LABEL: v_mul_f32_select_128_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
- %select.pow2 = select i1 %cond, float 1.0, float 64.0
+ %select.pow2 = select i1 %cond, float 128.0, float 64.0
%mul = fmul float %x, %select.pow2
ret float %mul
}
-define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n1_n64:
+define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) {
+; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n64:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc
; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n1_n64:
+; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2800000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1.0, vcc
-; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n1_n64:
+; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n64:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo
; GFX10-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n1_n64:
+; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n1_n64:
+; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo
; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n1_n64:
+; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
- %select.pow2 = select i1 %cond, float -1.0, float -64.0
+ %select.pow2 = select i1 %cond, float -128.0, float -64.0
%mul = fmul float %x, %select.pow2
ret float %mul
}
-define float @v_mul_f32_select_n64_n1(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n64_n1:
+define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) {
+; GFX9-LABEL: v_mul_f32_select_n128_n16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc
+; GFX9-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_mul_f32_select_n128_n16:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX1011-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
+ %cond = icmp eq i32 %arg, 0
+ %select.pow2 = select i1 %cond, float -128.0, float -16.0
+ %mul = fmul float %x, %select.pow2
+ ret float %mul
+}
+
+define float @v_contract_mul_add_f32_select_64_1(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_64_1:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n64_n1:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_64_1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2800000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1.0, v2, vcc
-; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n64_n1:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_64_1:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
+; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n64_n1:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_64_1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n64_n1:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_64_1:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
+; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n64_n1:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_64_1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
- %select.pow2 = select i1 %cond, float -64.0, float -1.0
- %mul = fmul float %x, %select.pow2
- ret float %mul
+ %select.pow2 = select contract i1 %cond, float 64.0, float 1.0
+ %mul = fmul contract float %x, %select.pow2
+ %fma = fadd contract float %mul, %y
+ ret float %fma
}
-define float @v_mul_f32_select_128_64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_128_64:
+define float @v_contract_mul_add_f32_select_1_64(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_1_64:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, 1.0, vcc
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-LABEL: v_mul_f32_select_128_64:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_1_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x43000000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: v_mul_f32_select_128_64:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_1_64:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
+; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: v_mul_f32_select_128_64:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_1_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x43000000, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: v_mul_f32_select_128_64:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_1_64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
+; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f32_select_128_64:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_1_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x43000000, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
- %select.pow2 = select i1 %cond, float 128.0, float 64.0
- %mul = fmul float %x, %select.pow2
- ret float %mul
+ %select.pow2 = select contract i1 %cond, float 1.0, float 64.0
+ %mul = fmul contract float %x, %select.pow2
+ %fma = fadd contract float %mul, %y
+ ret float %fma
}
-define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n64:
+define float @v_contract_mul_add_f32_select_n64_n1(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_n64_n1:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2800000
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, -1.0, v3, vcc
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n64:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc3000000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2800000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n64:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_n64_n1:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
+; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n64:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2800000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n64:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_n64_n1:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
+; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n64:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_n64_n1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2800000
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
- %select.pow2 = select i1 %cond, float -128.0, float -64.0
- %mul = fmul float %x, %select.pow2
- ret float %mul
+ %select.pow2 = select contract i1 %cond, float -64.0, float -1.0
+ %mul = fmul contract float %x, %select.pow2
+ %fma = fadd contract float %mul, %y
+ ret float %fma
}
-define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) {
-; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n16:
+define float @v_contract_mul_add_f32_select_n1_n64(i32 %arg, float %x, float %y) {
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_n1_n64:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2800000
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, -1.0, vcc
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n16:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc3000000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1800000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n16:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_n1_n64:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
+; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n16:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0xc1800000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n16:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_n1_n64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
+; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n16:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_n1_n64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0xc1800000
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f32 v0, -v1, v0
+; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
- %select.pow2 = select i1 %cond, float -128.0, float -16.0
- %mul = fmul float %x, %select.pow2
- ret float %mul
-}
-
-define float @v_contract_mul_add_f32_select_64_1(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_64_1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_64_1:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
-; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- %cond = icmp eq i32 %arg, 0
- %select.pow2 = select contract i1 %cond, float 64.0, float 1.0
- %mul = fmul contract float %x, %select.pow2
- %fma = fadd contract float %mul, %y
- ret float %fma
-}
-
-define float @v_contract_mul_add_f32_select_1_64(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_1_64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 1.0, vcc
-; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_1_64:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
-; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- %cond = icmp eq i32 %arg, 0
- %select.pow2 = select contract i1 %cond, float 1.0, float 64.0
- %mul = fmul contract float %x, %select.pow2
- %fma = fadd contract float %mul, %y
- ret float %fma
-}
-
-define float @v_contract_mul_add_f32_select_n64_n1(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_n64_n1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xc2800000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, -1.0, v3, vcc
-; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_n64_n1:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT: v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
-; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- %cond = icmp eq i32 %arg, 0
- %select.pow2 = select contract i1 %cond, float -64.0, float -1.0
- %mul = fmul contract float %x, %select.pow2
- %fma = fadd contract float %mul, %y
- ret float %fma
-}
-
-define float @v_contract_mul_add_f32_select_n1_n64(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_n1_n64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xc2800000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, -1.0, vcc
-; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_n1_n64:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
-; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- %cond = icmp eq i32 %arg, 0
%select.pow2 = select contract i1 %cond, float -1.0, float -64.0
%mul = fmul contract float %x, %select.pow2
%fma = fadd contract float %mul, %y
@@ -3810,11 +3780,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x43000000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x42800000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_fma_f32 v0, v1, v0, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_128_64:
@@ -3829,10 +3799,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x43000000, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f32 v0, v1, v0, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_128_64:
@@ -3847,10 +3818,11 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x43000000, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f32 v0, v1, v0, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, float 128.0, float 64.0
@@ -3860,22 +3832,57 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
}
define float @v_contract_mul_add_f32_select_128_4(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_128_4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x43000000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 4.0, v3, vcc
-; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x43000000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, 4.0, v3, vcc
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_128_4:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT: v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo
-; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo
+; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo
+; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_128_4:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, float 128.0, float 4.0
%mul = fmul contract float %x, %select.pow2
@@ -3907,143 +3914,102 @@ define float @v_contract_mul_add_f32_select_2_4(i32 %arg, float %x, float %y) {
}
define float @v_contract_mul_add_f32_select_4_128(i32 %arg, float %x, float %y) {
-; GFX9-LABEL: v_contract_mul_add_f32_select_4_128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x43000000
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 4.0, vcc
-; GFX9-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1011-LABEL: v_contract_mul_add_f32_select_4_128:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo
-; GFX1011-NEXT: v_fma_f32 v0, v1, v0, v2
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
- %cond = icmp eq i32 %arg, 0
- %select.pow2 = select i1 %cond, float 4.0, float 128.0
- %mul = fmul contract float %x, %select.pow2
- %fma = fadd contract float %mul, %y
- ret float %fma
-}
-
-define double @v_mul_f64_select_64_1(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_64_1:
+; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_4_128:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x43000000
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v3, 4.0, vcc
+; GFX9-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-LABEL: v_mul_f64_select_64_1:
+; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_4_128:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x40500000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x3ff00000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: v_mul_f64_select_64_1:
+; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_4_128:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo
+; GFX10-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: v_mul_f64_select_64_1:
+; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_4_128:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x3ff00000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x40500000, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: v_mul_f64_select_64_1:
+; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_4_128:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo
+; GFX11-SDAG-NEXT: v_fma_f32 v0, v1, v0, v2
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f64_select_64_1:
+; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_4_128:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0x3ff00000 :: v_dual_mov_b32 v3, 0
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x40500000, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
+ %select.pow2 = select i1 %cond, float 4.0, float 128.0
+ %mul = fmul contract float %x, %select.pow2
+ %fma = fadd contract float %mul, %y
+ ret float %fma
+}
+
+define double @v_mul_f64_select_64_1(i32 %arg, double %x) {
+; GFX9-LABEL: v_mul_f64_select_64_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_mul_f64_select_64_1:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
+ %cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, double 64.0, double 1.0
%mul = fmul double %x, %select.pow2
ret double %mul
}
define double @v_mul_f64_select_1_64(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_1_64:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f64_select_1_64:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x3ff00000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40500000
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f64_select_1_64:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f64_select_1_64:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x40500000
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x3ff00000, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f64_select_1_64:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f64_select_1_64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f64_select_1_64:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0
-; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x3ff00000, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f64_select_1_64:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, double 1.0, double 64.0
%mul = fmul double %x, %select.pow2
@@ -4051,59 +4017,21 @@ define double @v_mul_f64_select_1_64(i32 %arg, double %x) {
}
define double @v_mul_f64_select_n1_n64(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_n1_n64:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f64_select_n1_n64:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xbff00000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xc0500000
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f64_select_n1_n64:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f64_select_n1_n64:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xc0500000
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xbff00000, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f64_select_n1_n64:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f64_select_n1_n64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f64_select_n1_n64:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0xc0500000 :: v_dual_mov_b32 v3, 0
-; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xbff00000, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f64_select_n1_n64:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, double -1.0, double -64.0
%mul = fmul double %x, %select.pow2
@@ -4122,12 +4050,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) {
; GFX9-GISEL-LABEL: v_mul_f64_select_128_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x40600000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x40500000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_mul_f64_select_128_64:
@@ -4141,11 +4067,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) {
; GFX10-GISEL-LABEL: v_mul_f64_select_128_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x40500000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x40600000, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_mul_f64_select_128_64:
@@ -4159,10 +4084,10 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) {
; GFX11-GISEL-LABEL: v_mul_f64_select_128_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0x40600000, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, double 128.0, double 64.0
@@ -4182,12 +4107,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xc0600000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xc0500000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n64:
@@ -4201,11 +4124,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xc0500000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n64:
@@ -4219,10 +4141,10 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
; GFX11-GISEL-LABEL: v_mul_f64_select_n128_n64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0xc0500000 :: v_dual_mov_b32 v3, 0
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, double -128.0, double -64.0
@@ -4231,59 +4153,21 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
}
define double @v_mul_f64_select_n128_n16(i32 %arg, double %x) {
-; GFX9-SDAG-LABEL: v_mul_f64_select_n128_n16:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n16:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xc0600000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xc0300000
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n16:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n16:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xc0300000
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n16:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f64_select_n128_n16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc
+; GFX9-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f64_select_n128_n16:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0xc0300000 :: v_dual_mov_b32 v3, 0
-; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[1:2], v[3:4]
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f64_select_n128_n16:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX1011-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, double -128.0, double -16.0
%mul = fmul double %x, %select.pow2
@@ -4305,12 +4189,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y
; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x40500000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0x3ff00000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_64_1:
@@ -4326,11 +4208,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y
; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40500000, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_64_1:
@@ -4345,10 +4226,10 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y
; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0x3ff00000 :: v_dual_mov_b32 v5, 0
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40500000, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select contract i1 %cond, double 64.0, double 1.0
@@ -4372,12 +4253,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y
; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0x40500000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_1_64:
@@ -4393,11 +4272,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y
; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0x40500000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x3ff00000, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_1_64:
@@ -4412,10 +4290,10 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y
; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0x40500000 :: v_dual_mov_b32 v5, 0
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x3ff00000, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select contract i1 %cond, double 1.0, double 64.0
@@ -4439,12 +4317,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double
; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0xc0500000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0xbff00000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1:
@@ -4460,11 +4336,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double
; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0xbff00000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0xc0500000, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1:
@@ -4479,10 +4354,10 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double
; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0xbff00000 :: v_dual_mov_b32 v5, 0
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0xc0500000, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select contract i1 %cond, double -64.0, double -1.0
@@ -4506,12 +4381,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double
; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0xbff00000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0xc0500000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64:
@@ -4527,11 +4400,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double
; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0xc0500000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0xbff00000, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64:
@@ -4546,10 +4418,10 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double
; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0xc0500000 :: v_dual_mov_b32 v5, 0
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0xbff00000, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[1:2], v0
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select contract i1 %cond, double -1.0, double -64.0
@@ -4573,12 +4445,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double
; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x40600000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0x40500000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_128_64:
@@ -4594,11 +4465,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double
; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0x40500000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_128_64:
@@ -4613,10 +4484,11 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double
; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0x40500000 :: v_dual_mov_b32 v5, 0
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, double 128.0, double 64.0
@@ -4640,12 +4512,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double %
; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x40600000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0x40100000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_128_4:
@@ -4661,11 +4531,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double %
; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0x40100000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_128_4:
@@ -4680,10 +4549,10 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double %
; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0x40100000 :: v_dual_mov_b32 v5, 0
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, double 128.0, double 4.0
@@ -4706,21 +4575,50 @@ define double @v_contract_mul_add_f64_select_2_4(i32 %arg, double %x, double %y)
; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_2_4:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x40100000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 2.0, vcc
-; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1011-LABEL: v_contract_mul_add_f64_select_2_4:
-; GFX1011: ; %bb.0:
-; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT: v_mov_b32_e32 v5, 0
-; GFX1011-NEXT: v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo
-; GFX1011-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
-; GFX1011-NEXT: s_setpc_b64 s[30:31]
+; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo
+; GFX10-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo
+; GFX11-SDAG-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_2_4:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, double 2.0, double 4.0
%mul = fmul contract double %x, %select.pow2
@@ -4743,12 +4641,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0x40100000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v7, 0x40600000
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_4_128:
@@ -4764,11 +4660,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, 0x40600000
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40100000, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_4_128:
@@ -4783,10 +4678,10 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, 0x40600000 :: v_dual_mov_b32 v5, 0
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v6, v6, 0x40100000, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f64 v[0:1], v[1:2], v0
+; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, double 4.0, double 128.0
@@ -4796,57 +4691,21 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
}
define half @v_mul_f16_select_64_1(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_64_1:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_64_1:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x5400
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_64_1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_64_1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x5400, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_64_1:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_64_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f16_select_64_1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00
-; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x5400, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_64_1:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX1011-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, half 64.0, half 1.0
%mul = fmul half %x, %select.pow2
@@ -4854,57 +4713,21 @@ define half @v_mul_f16_select_64_1(i32 %arg, half %x) {
}
define half @v_mul_f16_select_1_64(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_1_64:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_1_64:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_1_64:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_1_64:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x5400
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x3c00, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_1_64:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v1, v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_1_64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f16_select_1_64:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x5400
-; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x3c00, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_1_64:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, half 1.0, half 64.0
%mul = fmul half %x, %select.pow2
@@ -4912,57 +4735,21 @@ define half @v_mul_f16_select_1_64(i32 %arg, half %x) {
}
define half @v_mul_f16_select_n1_n64(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_n1_n64:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_n1_n64:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xbc00
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_n1_n64:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_n1_n64:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0xd400
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xbc00, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_n1_n64:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_n1_n64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-NEXT: v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f16_select_n1_n64:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0xd400
-; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xbc00, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_n1_n64:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX1011-NEXT: v_ldexp_f16_e64 v0, -v1, v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, half -1.0, half -64.0
%mul = fmul half %x, %select.pow2
@@ -4981,11 +4768,13 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
; GFX9-GISEL-LABEL: v_mul_f16_select_128_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x5800
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v2, v3
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_mul_f16_select_128_64:
@@ -4999,10 +4788,12 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
; GFX10-GISEL-LABEL: v_mul_f16_select_128_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x5400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x5800, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_mul_f16_select_128_64:
@@ -5016,10 +4807,12 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
; GFX11-GISEL-LABEL: v_mul_f16_select_128_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x5400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, half 128.0, half 64.0
@@ -5039,11 +4832,13 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xd800
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v2, v3
+; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n64:
@@ -5057,10 +4852,12 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0xd400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n64:
@@ -5074,10 +4871,12 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0xd400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, half -128.0, half -64.0
@@ -5086,57 +4885,21 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
}
define half @v_mul_f16_select_n128_n16(i32 %arg, half %x) {
-; GFX9-SDAG-LABEL: v_mul_f16_select_n128_n16:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc
-; GFX9-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n16:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xd800
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xcc00
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n16:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX10-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n16:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0xcc00
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n16:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX11-SDAG-NEXT: v_ldexp_f16_e64 v0, -v1, v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_f16_select_n128_n16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc
+; GFX9-NEXT: v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n16:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0xcc00
-; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
-; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1011-LABEL: v_mul_f16_select_n128_n16:
+; GFX1011: ; %bb.0:
+; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1011-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX1011-NEXT: v_ldexp_f16_e64 v0, -v1, v0
+; GFX1011-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, half -128.0, half -16.0
%mul = fmul half %x, %select.pow2
@@ -5157,11 +4920,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_64_1:
@@ -5176,10 +4938,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_64_1:
@@ -5194,10 +4956,10 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select contract i1 %cond, half 64.0, half 1.0
@@ -5220,11 +4982,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_1_64:
@@ -5239,10 +5000,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_1_64:
@@ -5257,10 +5018,10 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select contract i1 %cond, half 1.0, half 64.0
@@ -5283,11 +5044,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xbc00
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1:
@@ -5302,10 +5062,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xbc00
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd400, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0
+; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1:
@@ -5320,10 +5080,10 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xbc00
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd400, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select contract i1 %cond, half -64.0, half -1.0
@@ -5346,11 +5106,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xbc00
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0
+; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64:
@@ -5365,10 +5124,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0
+; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64:
@@ -5383,10 +5142,10 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select contract i1 %cond, half -1.0, half -64.0
@@ -5409,11 +5168,14 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5800
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_128_64:
@@ -5428,10 +5190,13 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_64:
@@ -5446,10 +5211,13 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, half 128.0, half 64.0
@@ -5472,11 +5240,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5800
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_128_4:
@@ -5491,10 +5258,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x4400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_4:
@@ -5509,10 +5276,10 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, half 128.0, half 4.0
@@ -5535,11 +5302,14 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x4000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_2_4:
@@ -5554,10 +5324,13 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x4400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x4000, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_2_4:
@@ -5572,10 +5345,13 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x4000, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, half 2.0, half 4.0
@@ -5598,11 +5374,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x4400
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5800
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_4_128:
@@ -5617,10 +5392,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x5800
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x4400, vcc_lo
-; GFX10-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_4_128:
@@ -5635,10 +5410,10 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x5800
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x4400, vcc_lo
-; GFX11-GISEL-NEXT: v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0, v0, v2
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq i32 %arg, 0
%select.pow2 = select i1 %cond, half 4.0, half 128.0
@@ -5664,15 +5439,13 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
; GFX9-GISEL-LABEL: v_mul_v2f16_select_64_1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_mul_v2f16_select_64_1:
@@ -5690,14 +5463,14 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
; GFX10-GISEL-LABEL: v_mul_v2f16_select_64_1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_mul_v2f16_select_64_1:
@@ -5715,14 +5488,15 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
; GFX11-GISEL-LABEL: v_mul_v2f16_select_64_1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0>
@@ -5747,15 +5521,13 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
; GFX9-GISEL-LABEL: v_mul_v2f16_select_1_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x3c00
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_mul_v2f16_select_1_64:
@@ -5773,14 +5545,14 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
; GFX10-GISEL-LABEL: v_mul_v2f16_select_1_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_mul_v2f16_select_1_64:
@@ -5798,14 +5570,15 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
; GFX11-GISEL-LABEL: v_mul_v2f16_select_1_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0>
@@ -5830,15 +5603,14 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
; GFX9-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xbc00
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
@@ -5856,14 +5628,15 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
; GFX10-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
@@ -5881,14 +5654,16 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
; GFX11-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0>
@@ -5913,15 +5688,19 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
; GFX9-GISEL-LABEL: v_mul_v2f16_select_128_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x5800
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 6, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_mul_v2f16_select_128_64:
@@ -5939,14 +5718,19 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
; GFX10-GISEL-LABEL: v_mul_v2f16_select_128_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 6, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_mul_v2f16_select_128_64:
@@ -5964,14 +5748,20 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
; GFX11-GISEL-LABEL: v_mul_v2f16_select_128_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x5400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 6, v1
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 64.0, half 64.0>
@@ -5996,15 +5786,20 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
; GFX9-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xd800
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 6, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v3, v4
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v3, v4
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
@@ -6022,14 +5817,20 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
; GFX10-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 6, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
@@ -6047,14 +5848,21 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xd400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 6, v1
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -64.0, half -64.0>
@@ -6079,15 +5887,14 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
; GFX9-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xd800
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xcc00
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 4, 7, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
@@ -6105,14 +5912,15 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
; GFX10-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xcc00
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 4, 7, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
@@ -6130,14 +5938,16 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xcc00
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 4, 7, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 4, 7, vcc_lo
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -16.0, half -16.0>
@@ -6162,15 +5972,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x3c00
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
@@ -6188,14 +5997,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5400, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
@@ -6213,14 +6023,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0>
@@ -6246,15 +6058,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x3c00
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x5400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
@@ -6272,14 +6083,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x3c00, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
@@ -6297,14 +6109,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0>
@@ -6330,15 +6144,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xbc00
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
@@ -6356,14 +6170,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xbc00
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0xd400, vcc_lo
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0xd400, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
@@ -6381,14 +6197,17 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0xbc00
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0xd400, vcc_lo
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 6, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 6, vcc_lo
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0xd400, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half -64.0, half -64.0>, <2 x half> <half -1.0, half -1.0>
@@ -6414,15 +6233,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xbc00
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0xd400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
@@ -6440,14 +6259,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0xbc00, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
@@ -6465,14 +6286,17 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0xd400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 6, 0, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 6, 0, vcc_lo
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0>
@@ -6498,15 +6322,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5800
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x5400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 6, v0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 6, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v4, v5
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v4, v5
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
@@ -6524,14 +6353,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 6, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v4
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
@@ -6549,14 +6384,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x5400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 6, v1
+; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v4
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v5, v1
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 64.0, half 64.0>
@@ -6582,15 +6423,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x5800
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x4400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 2, 7, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
@@ -6608,14 +6448,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 2, 7, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
@@ -6633,14 +6474,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 2, 7, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 2, 7, vcc_lo
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 4.0, half 4.0>
@@ -6666,15 +6509,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x4400
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 2, v1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xffff8000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x7fff
+; GFX9-GISEL-NEXT: v_med3_i32 v0, v0, v4, v5
+; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v4, v5
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
@@ -6692,14 +6540,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x4000, vcc_lo
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 2, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v4
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x4000, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
@@ -6717,14 +6571,20 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x4000, vcc_lo
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 2, v1
+; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v4
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v5, v1
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x4000, vcc_lo
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half 2.0, half 2.0>, <2 x half> <half 4.0, half 4.0>
@@ -6750,15 +6610,14 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 0x5800
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 7, 2, vcc
+; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
@@ -6776,14 +6635,15 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0x5800
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x4400, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 7, 2, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x4400, vcc_lo
+; GFX10-GISEL-NEXT: v_ldexp_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
@@ -6801,14 +6661,16 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0x5800
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, v4, 0x4400, vcc_lo
+; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 7, 2, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 7, 2, vcc_lo
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 0x4400, vcc_lo
+; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v1
; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%cond = icmp eq <2 x i32> %arg, zeroinitializer
%select.pow2 = select <2 x i1> %cond, <2 x half> <half 4.0, half 4.0>, <2 x half> <half 128.0, half 128.0>
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index f6ee007facd7..80b4d64b1236 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -14,6 +14,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
; Test patterns to match v_fract_* instructions.
@@ -103,6 +104,21 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) nocapture writeonly
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo
; GFX11-NEXT: global_store_b32 v[1:2], v4, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f32_e32 v3, v0
+; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX12-NEXT: v_floor_f32_e32 v4, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX12-NEXT: global_store_b32 v[1:2], v4, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -181,6 +197,18 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) nocaptu
; GFX11-NEXT: v_fract_f32_e32 v0, v0
; GFX11-NEXT: global_store_b32 v[1:2], v3, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f32_noinf_check:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f32_e32 v3, v0
+; GFX12-NEXT: v_fract_f32_e32 v0, v0
+; GFX12-NEXT: global_store_b32 v[1:2], v3, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -263,6 +291,22 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w
; GFX11-NEXT: v_min_f32_e32 v4, 0x3f7fffff, v4
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_nan_check_math_fract_f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f32_e32 v3, v0
+; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_sub_f32_e32 v4, v0, v3
+; GFX12-NEXT: global_store_b32 v[1:2], v3, off
+; GFX12-NEXT: v_min_num_f32_e32 v4, 0x3f7fffff, v4
+; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -314,6 +358,16 @@ define float @basic_fract_f32_nonans(float nofpclass(nan) %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fract_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_nonans:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f32_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -362,6 +416,19 @@ define float @basic_fract_f32_flags_minnum(float %x) {
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_flags_minnum:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f32_e32 v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -409,6 +476,16 @@ define float @basic_fract_f32_flags_fsub(float nofpclass(nan) %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fract_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_flags_fsub:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f32_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub nsz float %x, %floor
@@ -467,6 +544,17 @@ define <2 x float> @basic_fract_v2f32_nonans(<2 x float> nofpclass(nan) %x) {
; GFX11-NEXT: v_fract_f32_e32 v0, v0
; GFX11-NEXT: v_fract_f32_e32 v1, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_v2f32_nonans:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f32_e32 v0, v0
+; GFX12-NEXT: v_fract_f32_e32 v1, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x)
%sub = fsub <2 x float> %x, %floor
@@ -540,6 +628,20 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr
; GFX11-NEXT: v_fract_f32_e32 v0, v0
; GFX11-NEXT: global_store_b32 v[1:2], v3, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f32_multi_use_fsub_nonans:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f32_e32 v3, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_f32_e32 v3, v0, v3
+; GFX12-NEXT: v_fract_f32_e32 v0, v0
+; GFX12-NEXT: global_store_b32 v[1:2], v3, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -588,6 +690,16 @@ define float @nnan_minnum_fract_f32(float %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fract_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_minnum_fract_f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f32_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -638,6 +750,19 @@ define float @nnan_fsub_fract_f32(float %x) {
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_fsub_fract_f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f32_e32 v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub nnan float %x, %floor
@@ -686,6 +811,19 @@ define float @nnan_floor_fract_f32(float %x) {
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_floor_fract_f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f32_e32 v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call nnan float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -733,6 +871,16 @@ define float @nnan_src_fract_f32(float nofpclass(nan) %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fract_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: nnan_src_fract_f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f32_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -782,6 +930,19 @@ define float @not_fract_f32_wrong_const(float nofpclass(nan) %x) {
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7ffffe, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_wrong_const:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f32_e32 v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7ffffe, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -831,6 +992,19 @@ define float @not_fract_f32_swapped_fsub(float nofpclass(nan) %x) {
; GFX11-NEXT: v_sub_f32_e32 v0, v1, v0
; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_swapped_fsub:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f32_e32 v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %floor, %x
@@ -880,6 +1054,19 @@ define float @not_fract_f32_not_floor(float nofpclass(nan) %x) {
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_not_floor:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_trunc_f32_e32 v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.trunc.f32(float %x)
%sub = fsub float %x, %floor
@@ -929,6 +1116,19 @@ define float @not_fract_f32_different_floor(float %x, float %y) {
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_min_f32_e32 v0, 0x3f7fffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_different_floor:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f32_e32 v1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT: v_min_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %y)
%sub = fsub float %x, %floor
@@ -978,6 +1178,19 @@ define float @not_fract_f32_maxnum(float nofpclass(nan) %x) {
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_max_f32_e32 v0, 0x3f7fffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: not_fract_f32_maxnum:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f32_e32 v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX12-NEXT: v_max_num_f32_e32 v0, 0x3f7fffff, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -1000,6 +1213,15 @@ define float @fcmp_uno_check_is_nan_f32(float %x) {
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fcmp_uno_check_is_nan_f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -1054,6 +1276,16 @@ define float @select_nan_fract_f32(float %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fract_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: select_nan_fract_f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f32_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -1107,6 +1339,16 @@ define float @commuted_select_nan_fract_f32(float %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fract_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: commuted_select_nan_fract_f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f32_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -1168,6 +1410,22 @@ define float @wrong_commuted_nan_select_f32(float %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: wrong_commuted_nan_select_f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f32_e32 v1, v0
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_sub_f32_e32 v1, v0, v1
+; GFX12-NEXT: v_min_num_f32_e32 v1, 0x3f7fffff, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -1231,6 +1489,16 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fract_f16_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f16_nonan:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f16_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call half @llvm.floor.f16(half %x)
%sub = fsub half %x, %floor
@@ -1313,6 +1581,20 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
; GFX11-NEXT: v_fract_f16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_v2f16_nonan:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT: v_fract_f16_e32 v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_fract_f16_e32 v1, v1
+; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x)
%sub = fsub <2 x half> %x, %floor
@@ -1369,6 +1651,16 @@ define double @basic_fract_f64_nanans(double nofpclass(nan) %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fract_f64_e32 v[0:1], v[0:1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: basic_fract_f64_nanans:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call double @llvm.floor.f64(double %x)
%sub = fsub double %x, %floor
@@ -1461,6 +1753,18 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) nocapture
; GFX11-NEXT: v_fract_f16_e32 v0, v0
; GFX11-NEXT: global_store_b16 v[1:2], v3, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f16_noinf_check:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f16_e32 v3, v0
+; GFX12-NEXT: v_fract_f16_e32 v0, v0
+; GFX12-NEXT: global_store_b16 v[1:2], v3, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call half @llvm.floor.f16(half %x)
%sub = fsub half %x, %floor
@@ -1546,6 +1850,18 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) nocap
; GFX11-NEXT: v_fract_f64_e32 v[0:1], v[0:1]
; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f64_noinf_check:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_floor_f64_e32 v[4:5], v[0:1]
+; GFX12-NEXT: v_fract_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call double @llvm.floor.f64(double %x)
%sub = fsub double %x, %floor
@@ -1600,6 +1916,16 @@ define float @select_nan_fract_f32_flags_select(float %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fract_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: select_nan_fract_f32_flags_select:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f32_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -1653,6 +1979,16 @@ define float @select_nan_fract_f32_flags_minnum(float %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fract_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: select_nan_fract_f32_flags_minnum:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f32_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
%sub = fsub float %x, %floor
@@ -1769,6 +2105,25 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off
; GFX11-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_v2f32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f32_e32 v6, v0
+; GFX12-NEXT: v_cmp_class_f32_e64 s0, v0, 0x204
+; GFX12-NEXT: v_fract_f32_e32 v7, v1
+; GFX12-NEXT: v_floor_f32_e32 v4, v0
+; GFX12-NEXT: v_floor_f32_e32 v5, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0
+; GFX12-NEXT: v_cmp_class_f32_e64 s0, v1, 0x204
+; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off
+; GFX12-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x)
%sub = fsub <2 x float> %x, %floor
@@ -1881,6 +2236,21 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon
; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5
; GFX11-NEXT: global_store_b64 v[2:3], v[6:7], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f64_e32 v[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
+; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5
+; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call double @llvm.floor.f64(double %x)
%sub = fsub double %x, %floor
@@ -2002,6 +2372,21 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo
; GFX11-NEXT: global_store_b16 v[1:2], v4, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f16_e32 v3, v0
+; GFX12-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0|
+; GFX12-NEXT: v_floor_f16_e32 v4, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX12-NEXT: global_store_b16 v[1:2], v4, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call half @llvm.floor.f16(half %x)
%sub = fsub half %x, %floor
@@ -2168,6 +2553,29 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
; GFX11-NEXT: global_store_b32 v[1:2], v4, off
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_v2f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-NEXT: v_fract_f16_e32 v6, v0
+; GFX12-NEXT: v_floor_f16_e32 v5, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_fract_f16_e32 v4, v3
+; GFX12-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204
+; GFX12-NEXT: v_floor_f16_e32 v7, v3
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0
+; GFX12-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pack_b32_f16 v4, v5, v7
+; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0
+; GFX12-NEXT: global_store_b32 v[1:2], v4, off
+; GFX12-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x)
%sub = fsub <2 x half> %x, %floor
@@ -2311,6 +2719,26 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
; GFX11-NEXT: v_cndmask_b32_e64 v3, v13, 0, s1
; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: safe_math_fract_v2f64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fract_f64_e32 v[10:11], v[0:1]
+; GFX12-NEXT: v_cmp_class_f64_e64 s0, v[0:1], 0x204
+; GFX12-NEXT: v_fract_f64_e32 v[12:13], v[2:3]
+; GFX12-NEXT: v_cmp_class_f64_e64 s1, v[2:3], 0x204
+; GFX12-NEXT: v_floor_f64_e32 v[8:9], v[2:3]
+; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e64 v0, v10, 0, s0
+; GFX12-NEXT: v_cndmask_b32_e64 v1, v11, 0, s0
+; GFX12-NEXT: v_cndmask_b32_e64 v2, v12, 0, s1
+; GFX12-NEXT: v_cndmask_b32_e64 v3, v13, 0, s1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
%sub = fsub <2 x double> %x, %floor
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index b3001819e9aa..c1d5b5857b6b 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -2380,14 +2380,12 @@ define float @v_sqrt_f32_ulp2_contractable_rcp(float %x) {
; GISEL-IEEE: ; %bb.0:
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
-; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x4b800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc
+; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: v_rsq_f32_e32 v0, v0
-; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x45800000
-; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc
+; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
@@ -2734,20 +2732,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) {
; GISEL-IEEE: ; %bb.0:
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000
-; GISEL-IEEE-NEXT: v_mov_b32_e32 v3, 0x4b800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 24, vcc
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v0, v4
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[4:5]
+; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v3
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, s[4:5]
; GISEL-IEEE-NEXT: v_rsq_f32_e32 v0, v0
-; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2
; GISEL-IEEE-NEXT: v_rsq_f32_e32 v1, v1
-; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0x45800000
-; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
-; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc
+; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, s[4:5]
+; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2
; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
index cefd24032871..85c657789339 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
@@ -18,7 +18,7 @@ body: |
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
%0 = IMPLICIT_DEF
%1 = COPY %0.sub1
%2 = COPY %0.sub0
@@ -43,7 +43,7 @@ body: |
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
%0 = IMPLICIT_DEF
%1 = COPY %0.sub1
%2 = COPY %0.sub0
@@ -68,7 +68,7 @@ body: |
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
%0 = IMPLICIT_DEF
%1 = COPY %0.sub0
%2 = COPY %0.sub1
@@ -90,7 +90,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, 16384, 0, killed [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, 16384, 0, killed [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
%0:vgpr_32 = COPY killed $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 157f91ccc6b1..b2f113f08a91 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -668,37 +668,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1)
define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_add_i32 s3, s3, 1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_add_i32 s3, s3, 1
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
-; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_add_i32 s3, s3, 1
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
-; GFX12-SDAG-NEXT: s_mov_b32 s1, 1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
-; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, 1
+; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
@@ -934,37 +929,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(
define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_add_i32 s3, s3, -1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_add_i32 s3, s3, -1
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000000:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
-; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_add_i32 s3, s3, -1
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
-; GFX12-SDAG-NEXT: s_mov_b32 s1, -1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
-; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, -1
+; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
new file mode 100644
index 000000000000..503f27edf70d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
@@ -0,0 +1,204 @@
+# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -simplify-mir -start-before=greedy,2 -stress-regalloc=4 -stop-before=virtregrewriter,2 -o - -verify-regalloc %s 2> %t.err | FileCheck %s
+# RUN: FileCheck -check-prefix=ERR %s < %t.err
+
+# To allocate the vreg_512_align2, the allocation will attempt to
+# inflate the register class to av_512_align2. This will ultimately
+# not work, and the allocation will fail. There is an unproductive
+# live range split, and we end up with a snippet copy of an
+# unspillable register. Recursive assignment of interfering ranges
+# during last chance recoloring would delete the unspillable snippet
+# live range. Make sure there's no use after free when rolling back
+# the last chance assignment.
+
+# ERR: error: <unknown>:0:0: ran out of registers during register allocation in function 'inflated_reg_class_copy_use_after_free'
+# ERR: error: <unknown>:0:0: ran out of registers during register allocation in function 'inflated_reg_class_copy_use_after_free_lane_subset'
+
+--- |
+ define amdgpu_kernel void @inflated_reg_class_copy_use_after_free() {
+ ret void
+ }
+
+ define amdgpu_kernel void @inflated_reg_class_copy_use_after_free_lane_subset() {
+ ret void
+ }
+
+...
+
+# CHECK-LABEL: name: inflated_reg_class_copy_use_after_free
+# CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE0:%[0-9]+]]:vreg_512_align2 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: early-clobber [[MFMA0:%[0-9]+]]:vreg_512_align2 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[RESTORE0]], 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[MFMA0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY [[MFMA0]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE1]].sub0_sub1
+# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE2]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT3]].sub0:av_512_align2 = COPY [[RESTORE2]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT3]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[SPLIT3]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT5:%[0-9]+]].sub2:av_512_align2 = COPY [[SPLIT4]].sub3
+# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT5]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0 {
+# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT4]].sub2
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT9:%[0-9]+]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
+# CHECK-NEXT: undef [[SPLIT10:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: undef [[SPLIT13:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT10]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
+# CHECK-NEXT: [[MFMA_USE1]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]].sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[MFMA_USE1]]:vreg_512_align2 = V_MFMA_F32_16X16X1F32_mac_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[MFMA_USE1]], 0, 0, 0, implicit $mode, implicit $exec
+
+---
+name: inflated_reg_class_copy_use_after_free
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr72_sgpr73_sgpr74_sgpr75'
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 7
+ vgprForAGPRCopy: '$vgpr255'
+ sgprForEXECCopy: '$sgpr74_sgpr75'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+
+ %0:vgpr_32 = IMPLICIT_DEF
+ renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed undef renamable $sgpr4_sgpr5, 0, 0 :: (load (s64), addrspace 4)
+ S_NOP 0, implicit-def undef %1.sub12_sub13_sub14_sub15:vreg_512_align2
+ S_NOP 0, implicit-def %1.sub8_sub9_sub10_sub11:vreg_512_align2
+ S_NOP 0, implicit-def %1.sub4_sub5_sub6_sub7:vreg_512_align2
+ S_NOP 0, implicit-def %1.sub0_sub1_sub2_sub3:vreg_512_align2
+ early-clobber %2:vreg_512_align2 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, %1, 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ %1.sub2:vreg_512_align2 = COPY %2.sub3
+ %1.sub3:vreg_512_align2 = COPY %2.sub2
+ %1.sub4:vreg_512_align2 = COPY %2.sub0
+ %1.sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1:vreg_512_align2 = V_MFMA_F32_16X16X1F32_mac_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, %1, 0, 0, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR undef %3:vgpr_32, %1.sub12_sub13_sub14_sub15, undef renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# This test is similar to except it is still broken when the use
+# instruction does not read the full set of lanes after one attempted fix.
+
+# CHECK-LABEL: name: inflated_reg_class_copy_use_after_free_lane_subset
+# CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE_0:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: S_NOP 0, implicit-def early-clobber [[REG1:%[0-9]+]], implicit [[RESTORE_0]].sub0_sub1_sub2_sub3, implicit [[RESTORE_0]].sub4_sub5_sub6_sub7
+# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[REG1]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY [[REG1]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: [[RESTORE_1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE_1]].sub0_sub1
+# CHECK-NEXT: [[RESTORE_2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
+# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE_2]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[RESTORE_2]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: undef [[SPLIT5:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT4]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT5]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT3]].sub2:av_512_align2 = COPY [[SPLIT5]].sub3
+# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT3]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
+# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT5]].sub0 {
+# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT5]].sub2
+# CHECK-NEXT: }
+# CHECK-NEXT: [[SPLIT7]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
+# CHECK-NEXT: undef [[SPLIT9:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT7]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: undef [[LAST_USE:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
+# CHECK-NEXT: [[LAST_USE]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
+# CHECK-NEXT: [[LAST_USE]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: [[LAST_USE]].sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+# CHECK-NEXT: S_NOP 0, implicit-def [[LAST_USE]], implicit [[LAST_USE]].sub0_sub1_sub2_sub3, implicit [[LAST_USE]].sub4_sub5_sub6_sub7, implicit [[LAST_USE]].sub8_sub9_sub10_sub11
+
+---
+name: inflated_reg_class_copy_use_after_free_lane_subset
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr72_sgpr73_sgpr74_sgpr75'
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 7
+ vgprForAGPRCopy: '$vgpr255'
+ sgprForEXECCopy: '$sgpr74_sgpr75'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+
+ %0:vgpr_32 = IMPLICIT_DEF
+ renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed undef renamable $sgpr4_sgpr5, 0, 0 :: (load (s64), addrspace 4)
+ S_NOP 0, implicit-def undef %1.sub12_sub13_sub14_sub15:vreg_512_align2
+ S_NOP 0, implicit-def %1.sub8_sub9_sub10_sub11:vreg_512_align2
+ S_NOP 0, implicit-def %1.sub4_sub5_sub6_sub7:vreg_512_align2
+ S_NOP 0, implicit-def %1.sub0_sub1_sub2_sub3:vreg_512_align2
+ S_NOP 0, implicit-def early-clobber %2:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7
+ %1.sub2:vreg_512_align2 = COPY %2.sub3
+ %1.sub3:vreg_512_align2 = COPY %2.sub2
+ %1.sub4:vreg_512_align2 = COPY %2.sub0
+ %1.sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ %1.sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ S_NOP 0, implicit-def %1:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7, implicit %1.sub8_sub9_sub10_sub11
+ GLOBAL_STORE_DWORDX4_SADDR undef %3:vgpr_32, %1.sub12_sub13_sub14_sub15, undef renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index cf9fdbdc3439..2ceaca3497ec 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,16 +8,16 @@
define amdgpu_kernel void @s_input_output_i128() {
; GFX908-LABEL: name: s_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %12
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %12
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: s_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:SGPR_128 */, def %10
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %10
; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=s"()
call void asm sideeffect "; use $0", "s"(i128 %val)
diff --git a/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll b/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll
new file mode 100644
index 000000000000..1a87887e28d7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/issue121601-combine-concat-vectors-assumes-f16.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+define <4 x float> @issue121601(bfloat %fptrunc) {
+; CHECK-LABEL: issue121601:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %bitcast = bitcast bfloat %fptrunc to <1 x bfloat>
+ %shufflevector = shufflevector <1 x bfloat> %bitcast, <1 x bfloat> zeroinitializer, <2 x i32> zeroinitializer
+ %fpext = fpext <2 x bfloat> %shufflevector to <2 x float>
+ %shufflevector1 = shufflevector <2 x float> %fpext, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %shufflevector1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 3ff759a5cdb9..867025adca94 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -4,6 +4,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s
define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX6-LABEL: cos_f16:
@@ -80,6 +81,19 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-NEXT: v_cos_f16_e32 v1, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: cos_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_cos_f16_e32 v1, v1
+; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a.val = load half, ptr addrspace(1) %a
%r.val = call half @llvm.cos.f16(half %a.val)
store half %r.val, ptr addrspace(1) %r
@@ -188,6 +202,24 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: cos_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX12-NEXT: v_cos_f16_e32 v1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT: v_cos_f16_e32 v2, v2
+; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a.val = load <2 x half>, ptr addrspace(1) %a
%r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)
store <2 x half> %r.val, ptr addrspace(1) %r
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index ac515808a0d8..333d428c84bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -41,10 +41,10 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; SI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-GISEL-NEXT: s_mov_b32 s2, -1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-GISEL-NEXT: s_endpgm
;
@@ -78,9 +78,9 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v2, v0, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
@@ -115,9 +115,9 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s0, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
@@ -203,7 +203,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000
+; SI-GISEL-NEXT: v_not_b32_e32 v2, 63
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0
@@ -213,10 +213,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; SI-GISEL-NEXT: v_add_f32_e32 v0, s7, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v3, v3
; SI-GISEL-NEXT: v_exp_f32_e32 v1, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v3, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v3, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v2
; SI-GISEL-NEXT: s_mov_b32 s6, -1
; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -252,7 +252,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000
+; VI-GISEL-NEXT: v_not_b32_e32 v2, 63
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0
@@ -262,10 +262,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-GISEL-NEXT: v_add_f32_e32 v0, s7, v0
; VI-GISEL-NEXT: v_exp_f32_e32 v3, v3
; VI-GISEL-NEXT: v_exp_f32_e32 v1, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v3, v0
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v3, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -300,7 +300,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000
+; GFX900-GISEL-NEXT: v_not_b32_e32 v2, 63
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0
@@ -310,10 +310,10 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s11, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v3
; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v3, v0
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v3, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v1, v2
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX900-GISEL-NEXT: s_endpgm
@@ -421,17 +421,17 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000
+; SI-GISEL-NEXT: v_not_b32_e32 v3, 63
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1
; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
; SI-GISEL-NEXT: v_add_f32_e32 v0, s0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1
; SI-GISEL-NEXT: s_mov_b32 s6, -1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v4
; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc
; SI-GISEL-NEXT: v_add_f32_e32 v4, s1, v4
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1
@@ -439,11 +439,11 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; SI-GISEL-NEXT: v_exp_f32_e32 v4, v4
; SI-GISEL-NEXT: v_add_f32_e32 v1, s2, v1
; SI-GISEL-NEXT: v_exp_f32_e32 v2, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, v2, v3
; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
; SI-GISEL-NEXT: s_endpgm
@@ -487,16 +487,16 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000
+; VI-GISEL-NEXT: v_not_b32_e32 v3, 63
; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1
; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v0, s0, v0
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v4
; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, s1, v4
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1
@@ -504,10 +504,10 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_add_f32_e32 v1, s2, v1
; VI-GISEL-NEXT: v_exp_f32_e32 v4, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v2, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; VI-GISEL-NEXT: v_ldexp_f32 v1, v4, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v2, v2, v3
; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5
; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
@@ -551,15 +551,15 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000
+; GFX900-GISEL-NEXT: v_not_b32_e32 v3, 63
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s0, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v4
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc
; GFX900-GISEL-NEXT: v_add_f32_e32 v4, s1, v4
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1
@@ -567,10 +567,10 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s2, v1
; GFX900-GISEL-NEXT: v_exp_f32_e32 v4, v4
; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v4, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX900-GISEL-NEXT: s_endpgm
@@ -710,7 +710,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000
+; SI-GISEL-NEXT: v_not_b32_e32 v4, 63
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
@@ -720,22 +720,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; SI-GISEL-NEXT: v_add_f32_e32 v1, s9, v1
; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v5
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v5
; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc
; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1]
; SI-GISEL-NEXT: v_add_f32_e32 v5, s10, v5
; SI-GISEL-NEXT: v_add_f32_e32 v2, s11, v2
; SI-GISEL-NEXT: v_exp_f32_e32 v5, v5
; SI-GISEL-NEXT: v_exp_f32_e32 v3, v2
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1]
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v3, v3, v4
; SI-GISEL-NEXT: s_mov_b32 s6, -1
; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -787,7 +787,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000
; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000
+; VI-GISEL-NEXT: v_not_b32_e32 v4, 63
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
@@ -797,22 +797,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; VI-GISEL-NEXT: v_add_f32_e32 v1, s9, v1
; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v5
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v5
; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc
; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1]
; VI-GISEL-NEXT: v_add_f32_e32 v5, s10, v5
; VI-GISEL-NEXT: v_add_f32_e32 v2, s11, v2
; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5
; VI-GISEL-NEXT: v_exp_f32_e32 v3, v2
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; VI-GISEL-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-GISEL-NEXT: v_ldexp_f32 v3, v3, v4
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2
; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -863,7 +863,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000
+; GFX900-GISEL-NEXT: v_not_b32_e32 v4, 63
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
@@ -873,22 +873,22 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s9, v1
; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v5
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1]
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v1, v5
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1]
; GFX900-GISEL-NEXT: v_add_f32_e32 v5, s10, v5
; GFX900-GISEL-NEXT: v_add_f32_e32 v2, s11, v2
; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v5
; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v2
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v5, v2
+; GFX900-GISEL-NEXT: v_ldexp_f32 v3, v3, v4
; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
@@ -1006,19 +1006,19 @@ define float @v_exp2_f32(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32:
; VI-SDAG: ; %bb.0:
@@ -1034,6 +1034,20 @@ define float @v_exp2_f32(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1048,6 +1062,20 @@ define float @v_exp2_f32(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1076,19 +1104,19 @@ define float @v_exp2_fabs_f32(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_fabs_f32:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_fabs_f32:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_fabs_f32:
; VI-SDAG: ; %bb.0:
@@ -1104,6 +1132,20 @@ define float @v_exp2_fabs_f32(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_fabs_f32:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_fabs_f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1118,6 +1160,20 @@ define float @v_exp2_fabs_f32(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_fabs_f32:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_fabs_f32:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1147,19 +1203,19 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_fneg_fabs_f32:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_sub_f32_e64 v0, v1, |v0|
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_fneg_fabs_f32:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_sub_f32_e64 v0, v1, |v0|
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_fneg_fabs_f32:
; VI-SDAG: ; %bb.0:
@@ -1175,6 +1231,20 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_fneg_fabs_f32:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_sub_f32_e64 v0, v1, |v0|
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1189,6 +1259,20 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_f32:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e64 v0, v1, |v0|
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_fneg_fabs_f32:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1219,19 +1303,19 @@ define float @v_exp2_fneg_f32(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_fneg_f32:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_fneg_f32:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_fneg_f32:
; VI-SDAG: ; %bb.0:
@@ -1247,6 +1331,20 @@ define float @v_exp2_fneg_f32(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_fneg_f32:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_fneg_f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1261,6 +1359,20 @@ define float @v_exp2_fneg_f32(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_fneg_f32:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_fneg_f32:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1290,19 +1402,19 @@ define float @v_exp2_f32_fast(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_fast:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_fast:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_fast:
; VI-SDAG: ; %bb.0:
@@ -1318,6 +1430,20 @@ define float @v_exp2_f32_fast(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_fast:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1332,6 +1458,20 @@ define float @v_exp2_f32_fast(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_fast:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1360,19 +1500,19 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_unsafe_math_attr:
; VI-SDAG: ; %bb.0:
@@ -1388,6 +1528,20 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_unsafe_math_attr:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1402,6 +1556,20 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_unsafe_math_attr:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1430,19 +1598,19 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_approx_fn_attr:
; VI-SDAG: ; %bb.0:
@@ -1458,6 +1626,20 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_approx_fn_attr:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1472,6 +1654,20 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_approx_fn_attr:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1500,19 +1696,19 @@ define float @v_exp2_f32_ninf(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_ninf:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_ninf:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_ninf:
; VI-SDAG: ; %bb.0:
@@ -1528,6 +1724,20 @@ define float @v_exp2_f32_ninf(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_ninf:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_ninf:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1542,6 +1752,20 @@ define float @v_exp2_f32_ninf(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_ninf:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_ninf:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1570,19 +1794,19 @@ define float @v_exp2_f32_afn(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_afn:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_afn:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_afn:
; VI-SDAG: ; %bb.0:
@@ -1598,6 +1822,20 @@ define float @v_exp2_f32_afn(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_afn:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1612,6 +1850,20 @@ define float @v_exp2_f32_afn(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_afn:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_afn:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1660,19 +1912,19 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_afn_dynamic:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_afn_dynamic:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_afn_dynamic:
; VI-SDAG: ; %bb.0:
@@ -1688,6 +1940,20 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_afn_dynamic:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_afn_dynamic:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1702,6 +1968,20 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_afn_dynamic:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_afn_dynamic:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1730,19 +2010,19 @@ define float @v_fabs_exp2_f32_afn(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_fabs_exp2_f32_afn:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_fabs_exp2_f32_afn:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_fabs_exp2_f32_afn:
; VI-SDAG: ; %bb.0:
@@ -1758,6 +2038,20 @@ define float @v_fabs_exp2_f32_afn(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_fabs_exp2_f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_fabs_exp2_f32_afn:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1772,6 +2066,20 @@ define float @v_fabs_exp2_f32_afn(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_fabs_exp2_f32_afn:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_fabs_exp2_f32_afn:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1821,19 +2129,19 @@ define float @v_exp2_f32_nnan(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_nnan:
; VI-SDAG: ; %bb.0:
@@ -1849,6 +2157,20 @@ define float @v_exp2_f32_nnan(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_nnan:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_nnan:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1863,6 +2185,20 @@ define float @v_exp2_f32_nnan(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_nnan:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -1911,19 +2247,19 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_nnan_dynamic:
; VI-SDAG: ; %bb.0:
@@ -1939,6 +2275,20 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_nnan_dynamic:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1953,6 +2303,20 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_nnan_dynamic:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -2001,19 +2365,19 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_ninf_dynamic:
; VI-SDAG: ; %bb.0:
@@ -2029,6 +2393,20 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_ninf_dynamic:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2043,6 +2421,20 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_ninf_dynamic:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -2071,19 +2463,19 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan_ninf:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_nnan_ninf:
; VI-SDAG: ; %bb.0:
@@ -2099,6 +2491,20 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_nnan_ninf:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_nnan_ninf:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2113,6 +2519,20 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan_ninf:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_nnan_ninf:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -2161,19 +2581,19 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic:
; VI-SDAG: ; %bb.0:
@@ -2189,6 +2609,20 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2203,6 +2637,20 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_nnan_ninf_dynamic:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -2251,19 +2699,19 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_dynamic_mode:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_dynamic_mode:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_dynamic_mode:
; VI-SDAG: ; %bb.0:
@@ -2279,6 +2727,20 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_dynamic_mode:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_dynamic_mode:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2293,6 +2755,20 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_dynamic_mode:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_dynamic_mode:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -2313,20 +2789,50 @@ define float @v_exp2_f32_undef() {
; GCN-SDAG-NEXT: v_exp_f32_e32 v0, 0x7fc00000
; GCN-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_undef:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
-; GCN-GISEL-NEXT: v_add_f32_e32 v1, s4, v1
-; GCN-GISEL-NEXT: v_add_f32_e64 v2, s4, 0
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_undef:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
+; SI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1
+; SI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_exp2_f32_undef:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
+; VI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1
+; VI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_exp2_f32_undef:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
+; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s4, v1
+; GFX900-GISEL-NEXT: v_add_f32_e64 v2, s4, 0
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp2_f32_undef:
; R600: ; %bb.0:
@@ -3359,19 +3865,19 @@ define float @v_exp2_f32_contract(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_contract:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_contract:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_contract:
; VI-SDAG: ; %bb.0:
@@ -3387,6 +3893,20 @@ define float @v_exp2_f32_contract(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_contract:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_contract:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3401,6 +3921,20 @@ define float @v_exp2_f32_contract(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_contract:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_contract:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -3449,19 +3983,19 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) {
; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GCN-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf:
; VI-SDAG: ; %bb.0:
@@ -3477,6 +4011,20 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) {
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3491,6 +4039,20 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) {
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; R600-LABEL: v_exp2_f32_contract_nnan_ninf:
; R600: ; %bb.0:
; R600-NEXT: CF_END
@@ -3518,3 +4080,5 @@ declare <3 x half> @llvm.exp2.v3f16(<3 x half>) #2
attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" }
attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN-GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index b9fef0834cb2..88ef7a936393 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -3,11 +3,13 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s
define { half, i32 } @test_frexp_f16_i32(half %a) {
; GFX6-SDAG-LABEL: test_frexp_f16_i32:
@@ -50,6 +52,19 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f16_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v1, v0
+; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f16_i32:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -96,6 +111,16 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) {
; GFX11-NEXT: v_frexp_mant_f16_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f16_i32_only_use_fract:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_fract:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -145,6 +170,18 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f16_i32_only_use_exp:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_exp:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -221,6 +258,25 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
; GFX11-NEXT: v_bfe_i32 v2, v4, 0, 16
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_v2f16_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT: v_frexp_mant_f16_e32 v2, v0
+; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_frexp_mant_f16_e32 v3, v1
+; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v4, v1
+; GFX12-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_pack_b32_f16 v0, v2, v3
+; GFX12-NEXT: v_bfe_i32 v2, v4, 0, 16
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -311,6 +367,20 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_frexp_mant_f16_e32 v1, v1
+; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -386,6 +456,22 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
; GFX11-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v1, v1
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -463,6 +549,19 @@ define { half, i16 } @test_frexp_f16_i16(half %a) {
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f16_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_mant_f16_e32 v2, v0
+; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f16_i16:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -509,6 +608,16 @@ define half @test_frexp_f16_i16_only_use_fract(half %a) {
; GFX11-NEXT: v_frexp_mant_f16_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f16_i16_only_use_fract:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_mant_f16_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_fract:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -554,6 +663,16 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) {
; GFX11-NEXT: v_frexp_exp_i16_f16_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f16_i16_only_use_exp:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_exp:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -623,6 +742,19 @@ define { float, i32 } @test_frexp_f32_i32(float %a) {
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f32_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_mant_f32_e32 v2, v0
+; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v1, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f32_i32:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -665,6 +797,16 @@ define float @test_frexp_f32_i32_only_use_fract(float %a) {
; GFX11-NEXT: v_frexp_mant_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f32_i32_only_use_fract:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_fract:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -706,6 +848,16 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) {
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f32_i32_only_use_exp:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_exp:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -771,6 +923,21 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) {
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_v2f32_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_mant_f32_e32 v4, v0
+; GFX12-NEXT: v_frexp_mant_f32_e32 v5, v1
+; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
+; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v3, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -846,6 +1013,17 @@ define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) {
; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX12-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -896,6 +1074,17 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
+; GFX12-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -954,6 +1143,19 @@ define { double, i32 } @test_frexp_f64_i32(double %a) {
; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f64_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_mant_f64_e32 v[3:4], v[0:1]
+; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v2, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f64_i32:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1000,6 +1202,16 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) {
; GFX11-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f64_i32_only_use_fract:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_fract:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1044,6 +1256,16 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) {
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_f64_i32_only_use_exp:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_exp:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1116,6 +1338,22 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
; GFX11-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
+; GFX12-LABEL: test_frexp_v2f64_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_mant_f64_e32 v[8:9], v[0:1]
+; GFX12-NEXT: v_frexp_mant_f64_e32 v[6:7], v[2:3]
+; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v4, v[0:1]
+; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v5, v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX12-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32:
; GFX6-GISEL: ; %bb.0:
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1174,6 +1412,17 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) {
; GFX11-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1]
; GFX11-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_fract:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1]
+; GFX12-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
%result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0
ret <2 x double> %result.0
@@ -1213,6 +1462,17 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) {
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1]
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX12-NEXT: v_frexp_exp_i32_f64_e32 v1, v[2:3]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
%result.1 = extractvalue { <2 x double>, <2 x i32> } %result, 1
ret <2 x i32> %result.1
@@ -1235,3 +1495,5 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo
; GCN: {{.*}}
; GFX11-GISEL: {{.*}}
; GFX11-SDAG: {{.*}}
+; GFX12-GISEL: {{.*}}
+; GFX12-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 218e41faa703..b850428a03c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -45,16 +45,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000
; SI-GISEL-NEXT: s_mov_b32 s6, -1
+; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0
; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4
; SI-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1
@@ -64,7 +65,6 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218
; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-GISEL-NEXT: s_endpgm
;
@@ -104,25 +104,25 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; VI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
+; VI-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
+; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v2
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -162,25 +162,25 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4
-; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1
-; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v2, -v5
+; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v3, v2
+; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v5, v2
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x41b17218
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -218,24 +218,26 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s2
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX1100-GISEL-NEXT: s_endpgm
;
; R600-LABEL: s_log_f32:
@@ -358,35 +360,36 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217
+; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s6, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
; SI-GISEL-NEXT: s_mov_b32 s6, -1
-; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v2
-; SI-GISEL-NEXT: v_fma_f32 v7, v2, v3, -v6
-; SI-GISEL-NEXT: v_fma_f32 v7, v2, v4, v7
-; SI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v1
+; SI-GISEL-NEXT: v_fma_f32 v6, v1, v2, -v5
+; SI-GISEL-NEXT: v_fma_f32 v6, v1, v3, v6
+; SI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0
-; SI-GISEL-NEXT: v_log_f32_e32 v1, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s7, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v5, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x41b17218
; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
-; SI-GISEL-NEXT: v_fma_f32 v3, v1, v3, -v2
-; SI-GISEL-NEXT: v_fma_f32 v3, v1, v4, v3
-; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v5
+; SI-GISEL-NEXT: v_fma_f32 v2, v5, v2, -v1
+; SI-GISEL-NEXT: v_fma_f32 v2, v5, v3, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[0:1]
; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
@@ -445,42 +448,43 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2
-; VI-GISEL-NEXT: v_log_f32_e32 v2, v2
-; VI-GISEL-NEXT: v_and_b32_e32 v4, 0xfffff000, v2
-; VI-GISEL-NEXT: v_sub_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v1, s6, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v1, v1
+; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v1
+; VI-GISEL-NEXT: v_sub_f32_e32 v4, v1, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v5
-; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5
-; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4
; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v3
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v2
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT: v_log_f32_e32 v1, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s7, v0
+; VI-GISEL-NEXT: v_log_f32_e32 v3, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41b17218
; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0
-; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
-; VI-GISEL-NEXT: v_sub_f32_e32 v5, v1, v2
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v3
+; VI-GISEL-NEXT: v_sub_f32_e32 v5, v3, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v5
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v1
; VI-GISEL-NEXT: v_add_f32_e32 v6, v7, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5
; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v5
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v3
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
+; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v2
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
@@ -531,37 +535,38 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s10, v2
-; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v2
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v7, v2, v3, -v6
-; GFX900-GISEL-NEXT: v_fma_f32 v7, v2, v4, v7
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s10, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v1
+; GFX900-GISEL-NEXT: v_fma_f32 v7, v1, v3, -v6
+; GFX900-GISEL-NEXT: v_fma_f32 v7, v1, v4, v7
; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v5
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1]
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s11, v0
-; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x41b17218
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
-; GFX900-GISEL-NEXT: v_fma_f32 v3, v1, v3, -v2
-; GFX900-GISEL-NEXT: v_fma_f32 v3, v1, v4, v3
-; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v5
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[0:1]
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s11, v0
+; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v6
+; GFX900-GISEL-NEXT: v_fma_f32 v3, v6, v3, -v1
+; GFX900-GISEL-NEXT: v_fma_f32 v3, v6, v4, v3
+; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, v5
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -608,31 +613,37 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, 0x3f317217, v0 :: v_dual_mul_f32 v3, 0x3f317217, v1
+; GFX1100-GISEL-NEXT: v_dual_mul_f32 v3, 0x3f317217, v1 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_fma_f32 v5, 0x3f317217, v1, -v3
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v5, 0x3377d1cf, v1
+; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3f317217, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
; GFX1100-GISEL-NEXT: v_fma_f32 v4, 0x3f317217, v0, -v2
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_fma_f32 v5, 0x3f317217, v1, -v3
-; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v4, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT: v_add_f32_e32 v2, v2, v4
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s4
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s5
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_mov_b32 v2, 0
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1100-GISEL-NEXT: s_endpgm
@@ -808,49 +819,51 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217
+; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf
-; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
; SI-GISEL-NEXT: s_mov_b32 s6, -1
-; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v0
-; SI-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v6
-; SI-GISEL-NEXT: v_fma_f32 v7, v0, v4, v7
-; SI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0
+; SI-GISEL-NEXT: v_fma_f32 v6, v0, v2, -v5
+; SI-GISEL-NEXT: v_fma_f32 v6, v0, v3, v6
+; SI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT: v_mul_f32_e32 v6, s9, v6
-; SI-GISEL-NEXT: v_log_f32_e32 v6, v6
-; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v5, s9, v5
+; SI-GISEL-NEXT: v_log_f32_e32 v5, v5
+; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x41b17218
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v6
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1
-; SI-GISEL-NEXT: v_fma_f32 v9, v6, v3, -v8
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v1
-; SI-GISEL-NEXT: v_fma_f32 v9, v6, v4, v9
-; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v8, s[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v7, s[0:1]
-; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
-; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v2
-; SI-GISEL-NEXT: v_fma_f32 v3, v2, v3, -v6
-; SI-GISEL-NEXT: v_fma_f32 v3, v2, v4, v3
-; SI-GISEL-NEXT: v_add_f32_e32 v3, v6, v3
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc
-; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317217, v5
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_fma_f32 v8, v5, v2, -v7
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_fma_f32 v8, v5, v3, v8
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s10, v1
+; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT: v_log_f32_e32 v8, v1
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1]
+; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5
+; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v8
+; SI-GISEL-NEXT: v_fma_f32 v2, v8, v2, -v5
+; SI-GISEL-NEXT: v_fma_f32 v2, v8, v3, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v8|, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3
; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
@@ -927,12 +940,13 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v3
@@ -943,45 +957,46 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v2
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, s9, v3
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; VI-GISEL-NEXT: v_ldexp_f32 v3, s9, v3
; VI-GISEL-NEXT: v_log_f32_e32 v3, v3
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41b17218
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v3
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41b17218
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v3
+; VI-GISEL-NEXT: v_sub_f32_e32 v6, v3, v5
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1
-; VI-GISEL-NEXT: v_sub_f32_e32 v7, v3, v6
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317000, v7
-; VI-GISEL-NEXT: v_log_f32_e32 v2, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v6
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v5
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, v4
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, v6, s[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT: v_ldexp_f32 v1, s10, v1
+; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT: v_log_f32_e32 v6, v1
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, v2
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[2:3]
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1]
; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v3
-; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v2
-; VI-GISEL-NEXT: v_sub_f32_e32 v6, v2, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v6
+; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v6
+; VI-GISEL-NEXT: v_sub_f32_e32 v5, v6, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v3
; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7
; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v4
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v5
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v6|, v2
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5
@@ -1046,49 +1061,51 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s8, v0
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf
; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v6
+; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v2, -v6
; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v4, v7
; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s9, v6
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v6
+; GFX900-GISEL-NEXT: v_ldexp_f32 v6, s9, v6
; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v6
; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8
; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v6
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1
-; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v3, -v8
-; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v2, -v8
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v4, v9
+; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s10, v1
; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
+; GFX900-GISEL-NEXT: v_log_f32_e32 v9, v1
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v5
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v8, s[2:3]
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v7, s[0:1]
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v3, v2, v3, -v6
-; GFX900-GISEL-NEXT: v_fma_f32 v3, v2, v4, v3
-; GFX900-GISEL-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v9
+; GFX900-GISEL-NEXT: v_fma_f32 v2, v9, v2, -v6
+; GFX900-GISEL-NEXT: v_fma_f32 v2, v9, v4, v2
+; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v6, v2
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v9|, v5
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1]
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -1156,49 +1173,55 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s6
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s6
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v2 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_fma_f32 v8, 0x3f317217, v2, -v5
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v8, 0x3377d1cf, v2
+; GFX1100-GISEL-NEXT: v_add_f32_e32 v5, v5, v8
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317217, v1
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v6, 0x3f317217, v0, -v3
+; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v6, 0x3377d1cf, v0 :: v_dual_lshlrev_b32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v2
+; GFX1100-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317217, v1
; GFX1100-GISEL-NEXT: v_fma_f32 v7, 0x3f317217, v1, -v4
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v6, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT: v_fma_f32 v8, 0x3f317217, v2, -v5
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1
-; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v8, 0x3377d1cf, v2 :: v_dual_mov_b32 v3, 0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v3, 0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT: v_sub_f32_e32 v1, v1, v10
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_sub_f32 v0, v0, v9
+; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v9 :: v_dual_sub_f32 v1, v1, v10
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
@@ -1433,62 +1456,65 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217
+; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3377d1cf
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x3377d1cf
-; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000
+; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
; SI-GISEL-NEXT: s_mov_b32 s6, -1
+; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT: v_fma_f32 v7, v0, v4, -v1
-; SI-GISEL-NEXT: v_fma_f32 v7, v0, v5, v7
-; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v6
+; SI-GISEL-NEXT: v_fma_f32 v6, v0, v3, -v1
+; SI-GISEL-NEXT: v_fma_f32 v6, v0, v4, v6
+; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v6
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5
; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s9, v1
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v1
-; SI-GISEL-NEXT: v_fma_f32 v9, v1, v4, -v8
-; SI-GISEL-NEXT: v_fma_f32 v9, v1, v5, v9
+; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x41b17218
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317217, v1
+; SI-GISEL-NEXT: v_fma_f32 v8, v1, v3, -v7
+; SI-GISEL-NEXT: v_fma_f32 v8, v1, v4, v8
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v9, s10, v9
-; SI-GISEL-NEXT: v_log_f32_e32 v9, v9
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v6
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v7, s[0:1]
+; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 5, v8
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v8, s10, v8
+; SI-GISEL-NEXT: v_log_f32_e32 v8, v8
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v5
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v8
-; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v9
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2
-; SI-GISEL-NEXT: v_fma_f32 v10, v9, v4, -v8
-; SI-GISEL-NEXT: v_log_f32_e32 v3, v2
-; SI-GISEL-NEXT: v_fma_f32 v10, v9, v5, v10
-; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v10
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v9|, v6
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v8, s[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8
-; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v3
-; SI-GISEL-NEXT: v_fma_f32 v4, v3, v4, -v8
-; SI-GISEL-NEXT: v_fma_f32 v4, v3, v5, v4
-; SI-GISEL-NEXT: v_add_f32_e32 v4, v8, v4
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v6
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1]
+; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v7
+; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317217, v8
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_fma_f32 v9, v8, v3, -v7
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT: v_fma_f32 v9, v8, v4, v9
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, s11, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v9
+; SI-GISEL-NEXT: v_log_f32_e32 v9, v2
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v8|, v5
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v7, s[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v7
+; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317217, v9
+; SI-GISEL-NEXT: v_fma_f32 v3, v9, v3, -v7
+; SI-GISEL-NEXT: v_fma_f32 v3, v9, v4, v3
+; SI-GISEL-NEXT: v_add_f32_e32 v3, v7, v3
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v9|, v5
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-GISEL-NEXT: s_endpgm
;
@@ -1581,12 +1607,13 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000
; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v1
@@ -1597,62 +1624,64 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v1, s9, v1
; VI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41b17218
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v1
-; VI-GISEL-NEXT: v_sub_f32_e32 v7, v1, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v6
-; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317000, v7
-; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41b17218
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v1
+; VI-GISEL-NEXT: v_sub_f32_e32 v6, v1, v5
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v6
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v5
+; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 1.0, v3, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s10, v7
-; VI-GISEL-NEXT: v_log_f32_e32 v7, v7
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v4
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v5, s[0:1]
-; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
-; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v6
+; VI-GISEL-NEXT: v_ldexp_f32 v6, s10, v6
+; VI-GISEL-NEXT: v_log_f32_e32 v6, v6
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v3
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v6
+; VI-GISEL-NEXT: v_sub_f32_e32 v7, v6, v5
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT: v_sub_f32_e32 v8, v7, v6
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v10, 0x3805fdf4, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v9, v10, v9
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317000, v8
-; VI-GISEL-NEXT: v_log_f32_e32 v3, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v7|, v4
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v6, s[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
-; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v3
-; VI-GISEL-NEXT: v_sub_f32_e32 v7, v3, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v6
+; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v5
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317000, v7
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT: v_ldexp_f32 v2, s11, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7
+; VI-GISEL-NEXT: v_log_f32_e32 v7, v2
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v3
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v5, s[2:3]
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5
+; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v7
+; VI-GISEL-NEXT: v_sub_f32_e32 v6, v7, v5
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v6
+; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3805fdf4, v5
+; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
+; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, v3
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1730,61 +1759,64 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x3377d1cf
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s8, v0
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x3377d1cf
; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v4, -v1
+; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v1
; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v5, v7
; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v7
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v6
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s9, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x41b17218
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8
; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v1
-; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v4, -v8
+; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v3, -v8
; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v5, v9
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v9, s10, v9
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v9, 5, v9
+; GFX900-GISEL-NEXT: v_ldexp_f32 v9, s10, v9
; GFX900-GISEL-NEXT: v_log_f32_e32 v9, v9
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v6
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[2:3]
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v7, s[0:1]
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v8
; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v9
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v4, -v8
-; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v3, -v8
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v5, v10
+; GFX900-GISEL-NEXT: v_ldexp_f32 v2, s11, v2
; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v10
+; GFX900-GISEL-NEXT: v_log_f32_e32 v10, v2
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v9|, v6
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v8, s[2:3]
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v3
-; GFX900-GISEL-NEXT: v_fma_f32 v4, v3, v4, -v8
-; GFX900-GISEL-NEXT: v_fma_f32 v4, v3, v5, v4
-; GFX900-GISEL-NEXT: v_add_f32_e32 v4, v8, v4
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v6
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v10
+; GFX900-GISEL-NEXT: v_fma_f32 v3, v10, v3, -v8
+; GFX900-GISEL-NEXT: v_fma_f32 v3, v10, v5, v3
+; GFX900-GISEL-NEXT: v_add_f32_e32 v3, v8, v3
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, v6
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v5
; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -1860,60 +1892,67 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s2
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s3
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s6
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3)
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v3, s3, v3
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT: v_mul_f32_e32 v7, 0x3f317217, v2
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT: v_mul_f32_e32 v8, 0x3f317217, v3
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT: v_fma_f32 v12, 0x3f317217, v2, -v7
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT: v_fma_f32 v13, 0x3f317217, v3, -v8
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3
+; GFX1100-GISEL-NEXT: v_add_f32_e32 v7, v7, v12
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3
+; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT: v_mul_f32_e32 v6, 0x3f317217, v1
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_fma_f32 v10, 0x3f317217, v0, -v5
; GFX1100-GISEL-NEXT: v_fma_f32 v11, 0x3f317217, v1, -v6
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_fma_f32 v12, 0x3f317217, v2, -v7
-; GFX1100-GISEL-NEXT: v_fma_f32 v13, 0x3f317217, v3, -v8
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v10, 0x3377d1cf, v0 :: v_dual_fmac_f32 v11, 0x3377d1cf, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v12, 0x3377d1cf, v2 :: v_dual_fmac_f32 v13, 0x3377d1cf, v3
; GFX1100-GISEL-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v9
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_sub_f32 v0, v0, v4
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
-; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX1100-GISEL-NEXT: v_dual_sub_f32 v1, v1, v9 :: v_dual_sub_f32 v2, v2, v14
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b128 v5, v[0:3], s[0:1]
; GFX1100-GISEL-NEXT: s_endpgm
;
; R600-LABEL: s_log_v4f32:
@@ -2126,10 +2165,10 @@ define float @v_log_f32(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -2175,16 +2214,16 @@ define float @v_log_f32(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2224,10 +2263,10 @@ define float @v_log_f32(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -2270,21 +2309,22 @@ define float @v_log_f32(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2329,10 +2369,10 @@ define float @v_log_fabs_f32(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, |v0|, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -2378,16 +2418,16 @@ define float @v_log_fabs_f32(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2427,10 +2467,10 @@ define float @v_log_fabs_f32(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -2475,20 +2515,22 @@ define float @v_log_fabs_f32(float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2534,10 +2576,10 @@ define float @v_log_fneg_fabs_f32(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -2583,16 +2625,16 @@ define float @v_log_fneg_fabs_f32(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2632,10 +2674,10 @@ define float @v_log_fneg_fabs_f32(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -2680,20 +2722,22 @@ define float @v_log_fneg_fabs_f32(float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2740,10 +2784,10 @@ define float @v_log_fneg_f32(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -2789,16 +2833,16 @@ define float @v_log_fneg_f32(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
@@ -2838,10 +2882,10 @@ define float @v_log_fneg_f32(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -2885,20 +2929,22 @@ define float @v_log_fneg_f32(float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -3304,10 +3350,10 @@ define float @v_log_f32_ninf(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -3353,16 +3399,16 @@ define float @v_log_f32_ninf(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
@@ -3402,10 +3448,10 @@ define float @v_log_f32_ninf(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -3448,21 +3494,22 @@ define float @v_log_f32_ninf(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -4038,10 +4085,10 @@ define float @v_log_f32_nnan(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -4087,16 +4134,16 @@ define float @v_log_f32_nnan(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
@@ -4136,10 +4183,10 @@ define float @v_log_f32_nnan(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -4182,21 +4229,22 @@ define float @v_log_f32_nnan(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -4381,10 +4429,10 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -4430,16 +4478,16 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
@@ -4479,10 +4527,10 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -4525,21 +4573,22 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -4724,10 +4773,10 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -4773,16 +4822,16 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
@@ -4822,10 +4871,10 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -4868,21 +4917,22 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -4924,10 +4974,10 @@ define float @v_log_f32_nnan_ninf(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -4967,16 +5017,16 @@ define float @v_log_f32_nnan_ninf(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v0
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v0
+; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317000, v0
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
@@ -5010,10 +5060,10 @@ define float @v_log_f32_nnan_ninf(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -5051,18 +5101,20 @@ define float @v_log_f32_nnan_ninf(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v0, v1, v2
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -5207,10 +5259,10 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -5250,16 +5302,16 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v0
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v0
+; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317000, v0
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
@@ -5293,10 +5345,10 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -5334,18 +5386,20 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v0, v1, v2
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -5419,10 +5473,10 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -5468,16 +5522,16 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
@@ -5517,10 +5571,10 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
@@ -5563,21 +5617,22 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index fd50d1b60fbd..d09df7583733 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -45,16 +45,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s0, v0
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000
; SI-GISEL-NEXT: s_mov_b32 s6, -1
+; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0
; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4
; SI-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1
@@ -64,7 +65,6 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b
; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-GISEL-NEXT: s_endpgm
;
@@ -104,25 +104,25 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; VI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
+; VI-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
+; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v2
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -162,25 +162,25 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e9a209a
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4
-; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1
-; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v2, -v5
+; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v3, v2
+; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v5, v2
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x411a209b
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -218,24 +218,26 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s2
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX1100-GISEL-NEXT: s_endpgm
;
; R600-LABEL: s_log10_f32:
@@ -358,35 +360,36 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e9a209a
+; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
-; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s6, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
; SI-GISEL-NEXT: s_mov_b32 s6, -1
-; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v2
-; SI-GISEL-NEXT: v_fma_f32 v7, v2, v3, -v6
-; SI-GISEL-NEXT: v_fma_f32 v7, v2, v4, v7
-; SI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v1
+; SI-GISEL-NEXT: v_fma_f32 v6, v1, v2, -v5
+; SI-GISEL-NEXT: v_fma_f32 v6, v1, v3, v6
+; SI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0
-; SI-GISEL-NEXT: v_log_f32_e32 v1, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s7, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v5, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x411a209b
; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
-; SI-GISEL-NEXT: v_fma_f32 v3, v1, v3, -v2
-; SI-GISEL-NEXT: v_fma_f32 v3, v1, v4, v3
-; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v5
+; SI-GISEL-NEXT: v_fma_f32 v2, v5, v2, -v1
+; SI-GISEL-NEXT: v_fma_f32 v2, v5, v3, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[0:1]
; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
@@ -445,42 +448,43 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s6, v2
-; VI-GISEL-NEXT: v_log_f32_e32 v2, v2
-; VI-GISEL-NEXT: v_and_b32_e32 v4, 0xfffff000, v2
-; VI-GISEL-NEXT: v_sub_f32_e32 v5, v2, v4
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v1, s6, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v1, v1
+; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v1
+; VI-GISEL-NEXT: v_sub_f32_e32 v4, v1, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v5
-; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5
-; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4
; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v3
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v2
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT: v_log_f32_e32 v1, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s7, v0
+; VI-GISEL-NEXT: v_log_f32_e32 v3, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x411a209b
; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0
-; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
-; VI-GISEL-NEXT: v_sub_f32_e32 v5, v1, v2
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v3
+; VI-GISEL-NEXT: v_sub_f32_e32 v5, v3, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v5
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v1
; VI-GISEL-NEXT: v_add_f32_e32 v6, v7, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5
; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v5
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v3
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
+; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v2
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[0:1]
; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
@@ -531,37 +535,38 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s10, v2
-; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v2
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v7, v2, v3, -v6
-; GFX900-GISEL-NEXT: v_fma_f32 v7, v2, v4, v7
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s10, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v1
+; GFX900-GISEL-NEXT: v_fma_f32 v7, v1, v3, -v6
+; GFX900-GISEL-NEXT: v_fma_f32 v7, v1, v4, v7
; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v1|, v5
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1]
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s11, v0
-; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x411a209b
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1
-; GFX900-GISEL-NEXT: v_fma_f32 v3, v1, v3, -v2
-; GFX900-GISEL-NEXT: v_fma_f32 v3, v1, v4, v3
-; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, v5
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[0:1]
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s11, v0
+; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v6
+; GFX900-GISEL-NEXT: v_fma_f32 v3, v6, v3, -v1
+; GFX900-GISEL-NEXT: v_fma_f32 v3, v6, v4, v3
+; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, v5
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -608,31 +613,37 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, 0x3e9a209a, v0 :: v_dual_mul_f32 v3, 0x3e9a209a, v1
+; GFX1100-GISEL-NEXT: v_dual_mul_f32 v3, 0x3e9a209a, v1 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_fma_f32 v5, 0x3e9a209a, v1, -v3
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v5, 0x3284fbcf, v1
+; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_mul_f32 v2, 0x3e9a209a, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
; GFX1100-GISEL-NEXT: v_fma_f32 v4, 0x3e9a209a, v0, -v2
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_fma_f32 v5, 0x3e9a209a, v1, -v3
-; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v4, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT: v_add_f32_e32 v2, v2, v4
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s4
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s5
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_mov_b32 v2, 0
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1100-GISEL-NEXT: s_endpgm
@@ -808,49 +819,51 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e9a209a
+; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf
-; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
; SI-GISEL-NEXT: s_mov_b32 s6, -1
-; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v0
-; SI-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v6
-; SI-GISEL-NEXT: v_fma_f32 v7, v0, v4, v7
-; SI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0
+; SI-GISEL-NEXT: v_fma_f32 v6, v0, v2, -v5
+; SI-GISEL-NEXT: v_fma_f32 v6, v0, v3, v6
+; SI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT: v_mul_f32_e32 v6, s9, v6
-; SI-GISEL-NEXT: v_log_f32_e32 v6, v6
-; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v5, s9, v5
+; SI-GISEL-NEXT: v_log_f32_e32 v5, v5
+; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x411a209b
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v6
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1
-; SI-GISEL-NEXT: v_fma_f32 v9, v6, v3, -v8
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v1
-; SI-GISEL-NEXT: v_fma_f32 v9, v6, v4, v9
-; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v8, s[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v7, s[0:1]
-; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
-; SI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v2
-; SI-GISEL-NEXT: v_fma_f32 v3, v2, v3, -v6
-; SI-GISEL-NEXT: v_fma_f32 v3, v2, v4, v3
-; SI-GISEL-NEXT: v_add_f32_e32 v3, v6, v3
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc
-; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v5
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_fma_f32 v8, v5, v2, -v7
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_fma_f32 v8, v5, v3, v8
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s10, v1
+; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT: v_log_f32_e32 v8, v1
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v5|, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v5, v7, s[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[0:1]
+; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5
+; SI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v8
+; SI-GISEL-NEXT: v_fma_f32 v2, v8, v2, -v5
+; SI-GISEL-NEXT: v_fma_f32 v2, v8, v3, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v8|, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3
; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
@@ -927,12 +940,13 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v3
@@ -943,45 +957,46 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v2
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, s9, v3
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; VI-GISEL-NEXT: v_ldexp_f32 v3, s9, v3
; VI-GISEL-NEXT: v_log_f32_e32 v3, v3
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x411a209b
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v3
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x411a209b
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v3
+; VI-GISEL-NEXT: v_sub_f32_e32 v6, v3, v5
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1
-; VI-GISEL-NEXT: v_sub_f32_e32 v7, v3, v6
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v7
-; VI-GISEL-NEXT: v_log_f32_e32 v2, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v6
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v5
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, v4
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, v6, s[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT: v_ldexp_f32 v1, s10, v1
+; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT: v_log_f32_e32 v6, v1
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, v2
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[2:3]
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1]
; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v3
-; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v2
-; VI-GISEL-NEXT: v_sub_f32_e32 v6, v2, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v6
+; VI-GISEL-NEXT: v_and_b32_e32 v3, 0xfffff000, v6
+; VI-GISEL-NEXT: v_sub_f32_e32 v5, v6, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v3
; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7
; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v4
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v5
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v6|, v2
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5
@@ -1046,49 +1061,51 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e9a209a
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s8, v0
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf
; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v6
+; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v2, -v6
; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v4, v7
; GFX900-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, s9, v6
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v6
+; GFX900-GISEL-NEXT: v_ldexp_f32 v6, s9, v6
; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v6
; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8
; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v6
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s10, v1
-; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v3, -v8
-; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v2, -v8
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX900-GISEL-NEXT: v_fma_f32 v9, v6, v4, v9
+; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s10, v1
; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
+; GFX900-GISEL-NEXT: v_log_f32_e32 v9, v1
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v5
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, v8, s[2:3]
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v7, s[0:1]
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v3, v2, v3, -v6
-; GFX900-GISEL-NEXT: v_fma_f32 v3, v2, v4, v3
-; GFX900-GISEL-NEXT: v_add_f32_e32 v3, v6, v3
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v2|, v5
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v9
+; GFX900-GISEL-NEXT: v_fma_f32 v2, v9, v2, -v6
+; GFX900-GISEL-NEXT: v_fma_f32 v2, v9, v4, v2
+; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v6, v2
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v9|, v5
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1]
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4
; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -1156,49 +1173,55 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s6
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s6
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v2 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_fma_f32 v8, 0x3e9a209a, v2, -v5
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v8, 0x3284fbcf, v2
+; GFX1100-GISEL-NEXT: v_add_f32_e32 v5, v5, v8
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v1
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v6, 0x3e9a209a, v0, -v3
+; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v6, 0x3284fbcf, v0 :: v_dual_lshlrev_b32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v2
+; GFX1100-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v1
; GFX1100-GISEL-NEXT: v_fma_f32 v7, 0x3e9a209a, v1, -v4
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v6, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT: v_fma_f32 v8, 0x3e9a209a, v2, -v5
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1
-; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v8, 0x3284fbcf, v2 :: v_dual_mov_b32 v3, 0
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v3, 0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT: v_sub_f32_e32 v1, v1, v10
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_sub_f32 v0, v0, v9
+; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v9 :: v_dual_sub_f32 v1, v1, v10
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
@@ -1433,62 +1456,65 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a
+; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3284fbcf
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x3284fbcf
-; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000
+; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x7f800000
; SI-GISEL-NEXT: s_mov_b32 s6, -1
+; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT: v_fma_f32 v7, v0, v4, -v1
-; SI-GISEL-NEXT: v_fma_f32 v7, v0, v5, v7
-; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v6
+; SI-GISEL-NEXT: v_fma_f32 v6, v0, v3, -v1
+; SI-GISEL-NEXT: v_fma_f32 v6, v0, v4, v6
+; SI-GISEL-NEXT: v_add_f32_e32 v1, v1, v6
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v5
; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s9, v1
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v1
-; SI-GISEL-NEXT: v_fma_f32 v9, v1, v4, -v8
-; SI-GISEL-NEXT: v_fma_f32 v9, v1, v5, v9
+; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x411a209b
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v7
+; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v1
+; SI-GISEL-NEXT: v_fma_f32 v8, v1, v3, -v7
+; SI-GISEL-NEXT: v_fma_f32 v8, v1, v4, v8
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v9, s10, v9
-; SI-GISEL-NEXT: v_log_f32_e32 v9, v9
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v6
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v7, s[0:1]
+; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 5, v8
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v8, s10, v8
+; SI-GISEL-NEXT: v_log_f32_e32 v8, v8
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v5
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v8
-; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v9
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2
-; SI-GISEL-NEXT: v_fma_f32 v10, v9, v4, -v8
-; SI-GISEL-NEXT: v_log_f32_e32 v3, v2
-; SI-GISEL-NEXT: v_fma_f32 v10, v9, v5, v10
-; SI-GISEL-NEXT: v_add_f32_e32 v8, v8, v10
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v9|, v6
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v8, s[2:3]
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8
-; SI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v3
-; SI-GISEL-NEXT: v_fma_f32 v4, v3, v4, -v8
-; SI-GISEL-NEXT: v_fma_f32 v4, v3, v5, v4
-; SI-GISEL-NEXT: v_add_f32_e32 v4, v8, v4
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v6
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1]
+; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v7
+; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v8
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_fma_f32 v9, v8, v3, -v7
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT: v_fma_f32 v9, v8, v4, v9
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, s11, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v7, v7, v9
+; SI-GISEL-NEXT: v_log_f32_e32 v9, v2
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v8|, v5
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v7, s[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v7
+; SI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v9
+; SI-GISEL-NEXT: v_fma_f32 v3, v9, v3, -v7
+; SI-GISEL-NEXT: v_fma_f32 v3, v9, v4, v3
+; SI-GISEL-NEXT: v_add_f32_e32 v3, v7, v3
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v9|, v5
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-GISEL-NEXT: s_endpgm
;
@@ -1581,12 +1607,13 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000
; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v4, v0, v1
@@ -1597,62 +1624,64 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v1, s9, v1
; VI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x411a209b
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v6
-; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v1
-; VI-GISEL-NEXT: v_sub_f32_e32 v7, v1, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v6
-; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v7
-; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x411a209b
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5
+; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v1
+; VI-GISEL-NEXT: v_sub_f32_e32 v6, v1, v5
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v6
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v5
+; VI-GISEL-NEXT: v_add_f32_e32 v7, v8, v7
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 1.0, v3, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s10, v7
-; VI-GISEL-NEXT: v_log_f32_e32 v7, v7
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v4
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v5, s[0:1]
-; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
-; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v7
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v6
+; VI-GISEL-NEXT: v_ldexp_f32 v6, s10, v6
+; VI-GISEL-NEXT: v_log_f32_e32 v6, v6
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v3
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5
+; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v6
+; VI-GISEL-NEXT: v_sub_f32_e32 v7, v6, v5
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT: v_sub_f32_e32 v8, v7, v6
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v10, 0x369a84fb, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v9, v10, v9
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a2000, v8
-; VI-GISEL-NEXT: v_log_f32_e32 v3, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v7|, v4
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v6, s[2:3]
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
-; VI-GISEL-NEXT: v_and_b32_e32 v6, 0xfffff000, v3
-; VI-GISEL-NEXT: v_sub_f32_e32 v7, v3, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v6
+; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v5
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v7
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; VI-GISEL-NEXT: v_add_f32_e32 v7, v7, v8
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT: v_ldexp_f32 v2, s11, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7
+; VI-GISEL-NEXT: v_log_f32_e32 v7, v2
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v6|, v3
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v5, s[2:3]
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5
+; VI-GISEL-NEXT: v_and_b32_e32 v5, 0xfffff000, v7
+; VI-GISEL-NEXT: v_sub_f32_e32 v6, v7, v5
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v6
+; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x369a84fb, v5
+; VI-GISEL-NEXT: v_add_f32_e32 v8, v9, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v5, s[0:1]
+; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8
+; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, v3
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5
@@ -1730,61 +1759,64 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x3284fbcf
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s8, v0
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x3284fbcf
; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v4, -v1
+; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v3, -v1
; GFX900-GISEL-NEXT: v_fma_f32 v7, v0, v5, v7
; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v1, v7
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v6
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s9, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, 0x411a209b
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v8
; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v1
-; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v4, -v8
+; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v3, -v8
; GFX900-GISEL-NEXT: v_fma_f32 v9, v1, v5, v9
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v9, 1.0, v3, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v9, s10, v9
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v9, 5, v9
+; GFX900-GISEL-NEXT: v_ldexp_f32 v9, s10, v9
; GFX900-GISEL-NEXT: v_log_f32_e32 v9, v9
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, v6
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[2:3]
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v7, s[0:1]
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v8
; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v9
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v4, -v8
-; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v3, -v8
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GFX900-GISEL-NEXT: v_fma_f32 v10, v9, v5, v10
+; GFX900-GISEL-NEXT: v_ldexp_f32 v2, s11, v2
; GFX900-GISEL-NEXT: v_add_f32_e32 v8, v8, v10
+; GFX900-GISEL-NEXT: v_log_f32_e32 v10, v2
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], |v9|, v6
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v8, s[2:3]
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc
; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v3
-; GFX900-GISEL-NEXT: v_fma_f32 v4, v3, v4, -v8
-; GFX900-GISEL-NEXT: v_fma_f32 v4, v3, v5, v4
-; GFX900-GISEL-NEXT: v_add_f32_e32 v4, v8, v4
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, v6
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v10
+; GFX900-GISEL-NEXT: v_fma_f32 v3, v10, v3, -v8
+; GFX900-GISEL-NEXT: v_fma_f32 v3, v10, v5, v3
+; GFX900-GISEL-NEXT: v_add_f32_e32 v3, v8, v3
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, v6
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v5
; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -1860,60 +1892,67 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s2
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s3
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s6
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(TRANS32_DEP_3)
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v3, s3, v3
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT: v_mul_f32_e32 v7, 0x3e9a209a, v2
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT: v_mul_f32_e32 v8, 0x3e9a209a, v3
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT: v_fma_f32 v12, 0x3e9a209a, v2, -v7
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT: v_fma_f32 v13, 0x3e9a209a, v3, -v8
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3
+; GFX1100-GISEL-NEXT: v_add_f32_e32 v7, v7, v12
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3
+; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT: v_mul_f32_e32 v6, 0x3e9a209a, v1
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_fma_f32 v10, 0x3e9a209a, v0, -v5
; GFX1100-GISEL-NEXT: v_fma_f32 v11, 0x3e9a209a, v1, -v6
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_fma_f32 v12, 0x3e9a209a, v2, -v7
-; GFX1100-GISEL-NEXT: v_fma_f32 v13, 0x3e9a209a, v3, -v8
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v10, 0x3284fbcf, v0 :: v_dual_fmac_f32 v11, 0x3284fbcf, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v12, 0x3284fbcf, v2 :: v_dual_fmac_f32 v13, 0x3284fbcf, v3
; GFX1100-GISEL-NEXT: v_dual_add_f32 v5, v5, v10 :: v_dual_add_f32 v6, v6, v11
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_dual_add_f32 v7, v7, v12 :: v_dual_add_f32 v8, v8, v13
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v2|
-; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_sub_f32 v1, v1, v9
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_sub_f32 v0, v0, v4
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v3|
-; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX1100-GISEL-NEXT: v_dual_sub_f32 v1, v1, v9 :: v_dual_sub_f32 v2, v2, v14
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b128 v5, v[0:3], s[0:1]
; GFX1100-GISEL-NEXT: s_endpgm
;
; R600-LABEL: s_log10_v4f32:
@@ -2126,10 +2165,10 @@ define float @v_log10_f32(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -2175,16 +2214,16 @@ define float @v_log10_f32(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2224,10 +2263,10 @@ define float @v_log10_f32(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -2270,21 +2309,22 @@ define float @v_log10_f32(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2329,10 +2369,10 @@ define float @v_log10_fabs_f32(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, |v0|, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -2378,16 +2418,16 @@ define float @v_log10_fabs_f32(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2427,10 +2467,10 @@ define float @v_log10_fabs_f32(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -2475,20 +2515,22 @@ define float @v_log10_fabs_f32(float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2534,10 +2576,10 @@ define float @v_log10_fneg_fabs_f32(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -2583,16 +2625,16 @@ define float @v_log10_fneg_fabs_f32(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2632,10 +2674,10 @@ define float @v_log10_fneg_fabs_f32(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -2680,20 +2722,22 @@ define float @v_log10_fneg_fabs_f32(float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2740,10 +2784,10 @@ define float @v_log10_fneg_f32(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -2789,16 +2833,16 @@ define float @v_log10_fneg_f32(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -2838,10 +2882,10 @@ define float @v_log10_fneg_f32(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -2885,20 +2929,22 @@ define float @v_log10_fneg_f32(float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -3304,10 +3350,10 @@ define float @v_log10_f32_ninf(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -3353,16 +3399,16 @@ define float @v_log10_f32_ninf(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -3402,10 +3448,10 @@ define float @v_log10_f32_ninf(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -3448,21 +3494,22 @@ define float @v_log10_f32_ninf(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -4038,10 +4085,10 @@ define float @v_log10_f32_nnan(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -4087,16 +4134,16 @@ define float @v_log10_f32_nnan(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -4136,10 +4183,10 @@ define float @v_log10_f32_nnan(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -4182,21 +4229,22 @@ define float @v_log10_f32_nnan(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -4381,10 +4429,10 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -4430,16 +4478,16 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -4479,10 +4527,10 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -4525,21 +4573,22 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -4724,10 +4773,10 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -4773,16 +4822,16 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -4822,10 +4871,10 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -4868,21 +4917,22 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -4924,10 +4974,10 @@ define float @v_log10_f32_nnan_ninf(float %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -4967,16 +5017,16 @@ define float @v_log10_f32_nnan_ninf(float %in) {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v0
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v0
+; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a2000, v0
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -5010,10 +5060,10 @@ define float @v_log10_f32_nnan_ninf(float %in) {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -5051,18 +5101,20 @@ define float @v_log10_f32_nnan_ninf(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v0, v1, v2
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -5207,10 +5259,10 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -5250,16 +5302,16 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v0
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v0
+; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a2000, v0
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -5293,10 +5345,10 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -5334,18 +5386,20 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v0, v1, v2
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -5419,10 +5473,10 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -5468,16 +5522,16 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
-; VI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -5517,10 +5571,10 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf
@@ -5563,21 +5617,22 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 2c5a9f58a199..8b3b79b0b1bd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -36,14 +36,14 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
; SI-GISEL-NEXT: s_load_dword s2, s[4:5], 0xb
; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s2, v0
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-GISEL-NEXT: s_mov_b32 s2, -1
; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -74,13 +74,13 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
; VI-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s2, v0
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -108,20 +108,19 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
;
; GFX900-GISEL-LABEL: s_log2_f32:
; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
-; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s2, v0
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -147,20 +146,22 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
;
; GFX1100-GISEL-LABEL: s_log2_f32:
; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1100-GISEL-NEXT: s_clause 0x1
+; GFX1100-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s2
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s2, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX1100-GISEL-NEXT: s_endpgm
;
; R600-LABEL: s_log2_f32:
@@ -242,21 +243,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; SI-GISEL-NEXT: v_mul_f32_e32 v3, s6, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0
-; SI-GISEL-NEXT: v_log_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_log_f32_e32 v1, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; SI-GISEL-NEXT: v_sub_f32_e32 v0, v3, v0
-; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, s6, v2
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s7, v0
+; SI-GISEL-NEXT: v_log_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v3, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1]
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0
+; SI-GISEL-NEXT: v_sub_f32_e32 v1, v3, v1
; SI-GISEL-NEXT: s_mov_b32 s6, -1
; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -291,21 +293,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, s6, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT: v_log_f32_e32 v3, v3
-; VI-GISEL-NEXT: v_log_f32_e32 v1, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; VI-GISEL-NEXT: v_sub_f32_e32 v0, v3, v0
-; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v2, s6, v2
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s7, v0
+; VI-GISEL-NEXT: v_log_f32_e32 v2, v2
+; VI-GISEL-NEXT: v_log_f32_e32 v3, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1]
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v2, v0
+; VI-GISEL-NEXT: v_sub_f32_e32 v1, v3, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -339,22 +342,23 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v0
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, s10, v3
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s11, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v3, s10, v3
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s11, v0
; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v3
-; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; GFX900-GISEL-NEXT: v_log_f32_e32 v4, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[0:1]
; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v3, v0
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1
; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -387,23 +391,28 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; GFX1100-GISEL-LABEL: s_log2_v2f32:
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s5
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s3, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
-; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1100-GISEL-NEXT: v_dual_sub_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX1100-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX1100-GISEL-NEXT: s_endpgm
;
; R600-LABEL: s_log2_v2f32:
@@ -506,32 +515,34 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; SI-GISEL-LABEL: s_log2_v3f32:
; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000
+; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000
+; SI-GISEL-NEXT: s_mov_b32 s6, -1
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1
-; SI-GISEL-NEXT: s_mov_b32 s6, -1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v3, s9, v3
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v3, v3
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s10, v1
; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v4, s1, v4
-; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT: v_log_f32_e32 v4, v4
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; SI-GISEL-NEXT: v_log_f32_e32 v4, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; SI-GISEL-NEXT: v_sub_f32_e32 v1, v3, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
-; SI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_sub_f32_e32 v2, v4, v2
; SI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
; SI-GISEL-NEXT: s_endpgm
@@ -571,32 +582,34 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; VI-GISEL-LABEL: s_log2_v3f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000
-; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v3, s9, v3
+; VI-GISEL-NEXT: v_ldexp_f32 v1, s10, v1
; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v4, s1, v4
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
-; VI-GISEL-NEXT: v_log_f32_e32 v4, v4
-; VI-GISEL-NEXT: v_log_f32_e32 v2, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
-; VI-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5
+; VI-GISEL-NEXT: v_log_f32_e32 v3, v3
+; VI-GISEL-NEXT: v_log_f32_e32 v4, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v1, v3, v1
+; VI-GISEL-NEXT: v_sub_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2
; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-GISEL-NEXT: s_endpgm
;
@@ -637,28 +650,30 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX900-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s1, v1
; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v4, s1, v4
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v4, 5, v4
+; GFX900-GISEL-NEXT: v_ldexp_f32 v4, s1, v4
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s2, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v4, v4
-; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; GFX900-GISEL-NEXT: v_log_f32_e32 v5, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v4, v1
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2
; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -702,33 +717,40 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; GFX1100-GISEL-LABEL: s_log2_v3f32:
; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v6, 0
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s1
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s6
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s3
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v3 :: v_dual_mov_b32 v3, 0
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v3
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b96 v6, v[0:2], s[4:5]
; GFX1100-GISEL-NEXT: s_endpgm
;
; R600-LABEL: s_log2_v3f32:
@@ -865,34 +887,37 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000
+; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000
+; SI-GISEL-NEXT: s_mov_b32 s6, -1
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v1, s9, v1
; SI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v3, s[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
-; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2
-; SI-GISEL-NEXT: v_log_f32_e32 v5, v5
-; SI-GISEL-NEXT: v_log_f32_e32 v3, v2
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; SI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2
-; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT: s_mov_b32 s6, -1
+; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 5, v4
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v4, s10, v4
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v2, s11, v2
+; SI-GISEL-NEXT: v_log_f32_e32 v4, v4
+; SI-GISEL-NEXT: v_log_f32_e32 v5, v2
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; SI-GISEL-NEXT: v_sub_f32_e32 v2, v4, v2
+; SI-GISEL-NEXT: v_sub_f32_e32 v3, v5, v3
; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-GISEL-NEXT: s_endpgm
@@ -942,33 +967,36 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-GISEL-NEXT: v_ldexp_f32 v0, s8, v0
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v1, s9, v1
; VI-GISEL-NEXT: v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v3, s[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
-; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; VI-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2
-; VI-GISEL-NEXT: v_log_f32_e32 v5, v5
-; VI-GISEL-NEXT: v_log_f32_e32 v3, v2
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2
-; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 5, v4
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; VI-GISEL-NEXT: v_ldexp_f32 v4, s10, v4
+; VI-GISEL-NEXT: v_ldexp_f32 v2, s11, v2
+; VI-GISEL-NEXT: v_log_f32_e32 v4, v4
+; VI-GISEL-NEXT: v_log_f32_e32 v5, v2
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; VI-GISEL-NEXT: v_sub_f32_e32 v2, v4, v2
+; VI-GISEL-NEXT: v_sub_f32_e32 v3, v5, v3
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3
; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2
; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -1018,34 +1046,37 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-GISEL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s8, v2
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s9, v2
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s8, v0
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v1, s9, v1
; GFX900-GISEL-NEXT: v_log_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc
; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1]
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[0:1]
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX900-GISEL-NEXT: v_ldexp_f32 v5, s10, v5
+; GFX900-GISEL-NEXT: v_ldexp_f32 v2, s11, v2
; GFX900-GISEL-NEXT: v_log_f32_e32 v5, v5
-; GFX900-GISEL-NEXT: v_log_f32_e32 v3, v2
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; GFX900-GISEL-NEXT: v_log_f32_e32 v6, v2
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v6, v3
; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
@@ -1095,39 +1126,46 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
;
; GFX1100-GISEL-LABEL: s_log2_v4f32:
; GFX1100-GISEL: ; %bb.0:
+; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX1100-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s1
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s2
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s6, 0x800000, s0
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s3
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s6
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s9
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s6
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1
-; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v1, s1, v1
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT: v_dual_sub_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v0, 5, v0
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v3, 5, v3
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v3, s3, v3
; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_3) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7
-; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: v_dual_sub_f32 v3, v3, v7 :: v_dual_lshlrev_b32 v2, 5, v2
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v2, s2, v2
+; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
+; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX1100-GISEL-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX1100-GISEL-NEXT: s_endpgm
;
; R600-LABEL: s_log2_v4f32:
@@ -1243,19 +1281,19 @@ define float @v_log2_f32(float %in) {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32:
; VI-SDAG: ; %bb.0:
@@ -1271,6 +1309,20 @@ define float @v_log2_f32(float %in) {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1285,6 +1337,20 @@ define float @v_log2_f32(float %in) {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1304,10 +1370,12 @@ define float @v_log2_f32(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1341,19 +1409,19 @@ define float @v_log2_fabs_f32(float %in) {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_fabs_f32:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_fabs_f32:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_fabs_f32:
; VI-SDAG: ; %bb.0:
@@ -1369,6 +1437,20 @@ define float @v_log2_fabs_f32(float %in) {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_fabs_f32:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_fabs_f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1383,6 +1465,20 @@ define float @v_log2_fabs_f32(float %in) {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_fabs_f32:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_fabs_f32:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1403,10 +1499,11 @@ define float @v_log2_fabs_f32(float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1441,19 +1538,19 @@ define float @v_log2_fneg_fabs_f32(float %in) {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_fneg_fabs_f32:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_fneg_fabs_f32:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -|v0|, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_fneg_fabs_f32:
; VI-SDAG: ; %bb.0:
@@ -1469,6 +1566,20 @@ define float @v_log2_fneg_fabs_f32(float %in) {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_fneg_fabs_f32:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_fneg_fabs_f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1483,6 +1594,20 @@ define float @v_log2_fneg_fabs_f32(float %in) {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_fneg_fabs_f32:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -|v0|, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_fneg_fabs_f32:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1503,10 +1628,11 @@ define float @v_log2_fneg_fabs_f32(float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -|v0|, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -|v0|, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1542,19 +1668,19 @@ define float @v_log2_fneg_f32(float %in) {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_fneg_f32:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_fneg_f32:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, -v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_fneg_f32:
; VI-SDAG: ; %bb.0:
@@ -1570,6 +1696,20 @@ define float @v_log2_fneg_f32(float %in) {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_fneg_f32:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_fneg_f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1584,6 +1724,20 @@ define float @v_log2_fneg_f32(float %in) {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_fneg_f32:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_fneg_f32:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1604,10 +1758,11 @@ define float @v_log2_fneg_f32(float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, -v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, -v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1642,19 +1797,19 @@ define float @v_log2_f32_fast(float %in) {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_fast:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_fast:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_fast:
; VI-SDAG: ; %bb.0:
@@ -1670,6 +1825,20 @@ define float @v_log2_f32_fast(float %in) {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_fast:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1684,6 +1853,20 @@ define float @v_log2_f32_fast(float %in) {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_fast:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1703,10 +1886,12 @@ define float @v_log2_f32_fast(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1740,19 +1925,19 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
; VI-SDAG: ; %bb.0:
@@ -1768,6 +1953,20 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1782,6 +1981,20 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1801,10 +2014,12 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1838,19 +2053,19 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_approx_fn_attr:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_approx_fn_attr:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_approx_fn_attr:
; VI-SDAG: ; %bb.0:
@@ -1866,6 +2081,20 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_approx_fn_attr:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_approx_fn_attr:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1880,6 +2109,20 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_approx_fn_attr:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_approx_fn_attr:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1899,10 +2142,12 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -1936,19 +2181,19 @@ define float @v_log2_f32_ninf(float %in) {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_ninf:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_ninf:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_ninf:
; VI-SDAG: ; %bb.0:
@@ -1964,6 +2209,20 @@ define float @v_log2_f32_ninf(float %in) {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_ninf:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_ninf:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1978,6 +2237,20 @@ define float @v_log2_f32_ninf(float %in) {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_ninf:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_ninf:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1997,10 +2270,12 @@ define float @v_log2_f32_ninf(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2034,19 +2309,19 @@ define float @v_log2_f32_afn(float %in) {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_afn:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_afn:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_afn:
; VI-SDAG: ; %bb.0:
@@ -2062,6 +2337,20 @@ define float @v_log2_f32_afn(float %in) {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_afn:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2076,6 +2365,20 @@ define float @v_log2_f32_afn(float %in) {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_afn:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_afn:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2095,10 +2398,12 @@ define float @v_log2_f32_afn(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2158,19 +2463,19 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_afn_dynamic:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_afn_dynamic:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_afn_dynamic:
; VI-SDAG: ; %bb.0:
@@ -2186,6 +2491,20 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_afn_dynamic:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_afn_dynamic:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2200,6 +2519,20 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_afn_dynamic:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_afn_dynamic:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2219,10 +2552,12 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2256,19 +2591,19 @@ define float @v_fabs_log2_f32_afn(float %in) {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_fabs_log2_f32_afn:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_fabs_log2_f32_afn:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e64 v0, |v0|, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_fabs_log2_f32_afn:
; VI-SDAG: ; %bb.0:
@@ -2284,6 +2619,20 @@ define float @v_fabs_log2_f32_afn(float %in) {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_fabs_log2_f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_fabs_log2_f32_afn:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2298,6 +2647,20 @@ define float @v_fabs_log2_f32_afn(float %in) {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_fabs_log2_f32_afn:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_fabs_log2_f32_afn:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2318,10 +2681,11 @@ define float @v_fabs_log2_f32_afn(float %in) {
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0|
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, |v0|, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, |v0|, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s0
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2382,19 +2746,19 @@ define float @v_log2_f32_nnan(float %in) {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_nnan:
; VI-SDAG: ; %bb.0:
@@ -2410,6 +2774,20 @@ define float @v_log2_f32_nnan(float %in) {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_nnan:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_nnan:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2424,6 +2802,20 @@ define float @v_log2_f32_nnan(float %in) {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_nnan:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2443,10 +2835,12 @@ define float @v_log2_f32_nnan(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2506,19 +2900,19 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan_dynamic:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan_dynamic:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_nnan_dynamic:
; VI-SDAG: ; %bb.0:
@@ -2534,6 +2928,20 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_nnan_dynamic:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_nnan_dynamic:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2548,6 +2956,20 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan_dynamic:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_nnan_dynamic:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2567,10 +2989,12 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2630,19 +3054,19 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_ninf_dynamic:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_ninf_dynamic:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_ninf_dynamic:
; VI-SDAG: ; %bb.0:
@@ -2658,6 +3082,20 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_ninf_dynamic:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_ninf_dynamic:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2672,6 +3110,20 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_ninf_dynamic:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_ninf_dynamic:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2691,10 +3143,12 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2728,19 +3182,19 @@ define float @v_log2_f32_nnan_ninf(float %in) {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan_ninf:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_nnan_ninf:
; VI-SDAG: ; %bb.0:
@@ -2756,6 +3210,20 @@ define float @v_log2_f32_nnan_ninf(float %in) {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_nnan_ninf:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_nnan_ninf:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2770,6 +3238,20 @@ define float @v_log2_f32_nnan_ninf(float %in) {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan_ninf:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2789,10 +3271,12 @@ define float @v_log2_f32_nnan_ninf(float %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2852,19 +3336,19 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic:
; VI-SDAG: ; %bb.0:
@@ -2880,6 +3364,20 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2894,6 +3392,20 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2913,10 +3425,12 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -2976,19 +3490,19 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX689-GISEL-LABEL: v_log2_f32_dynamic_mode:
-; GFX689-GISEL: ; %bb.0:
-; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX689-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-GISEL-LABEL: v_log2_f32_dynamic_mode:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_log2_f32_dynamic_mode:
; VI-SDAG: ; %bb.0:
@@ -3004,6 +3518,20 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; VI-GISEL-LABEL: v_log2_f32_dynamic_mode:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-SDAG-LABEL: v_log2_f32_dynamic_mode:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3018,6 +3546,20 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
+; GFX900-GISEL-LABEL: v_log2_f32_dynamic_mode:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1100-SDAG-LABEL: v_log2_f32_dynamic_mode:
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3037,10 +3579,12 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 4de0c548ad38..795ed6d542a1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -3,6 +3,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s
declare half @llvm.rint.f16(half %a)
declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
@@ -63,6 +64,24 @@ define amdgpu_kernel void @rint_f16(
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: rint_f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: buffer_load_u16 v0, off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_rndne_f16_e32 v0, v0
+; GFX12-NEXT: buffer_store_b16 v0, off, s[4:7], null
+; GFX12-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -168,6 +187,28 @@ define amdgpu_kernel void @rint_v2f16(
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: rint_v2f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT: v_rndne_f16_e32 v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_rndne_f16_e32 v1, v1
+; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 2bb89fdabda7..6927636ad04a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -4,6 +4,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s
define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX6-LABEL: sin_f16:
@@ -80,6 +81,19 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-NEXT: v_sin_f16_e32 v1, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sin_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sin_f16_e32 v1, v1
+; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a.val = load half, ptr addrspace(1) %a
%r.val = call half @llvm.sin.f16(half %a.val)
store half %r.val, ptr addrspace(1) %r
@@ -188,6 +202,24 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sin_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
+; GFX12-NEXT: v_sin_f16_e32 v1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT: v_sin_f16_e32 v2, v2
+; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%a.val = load <2 x half>, ptr addrspace(1) %a
%r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val)
store <2 x half> %r.val, ptr addrspace(1) %r
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 47777e3853e8..0d58afd1812d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -2,6 +2,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
declare half @llvm.trunc.f16(half %a)
declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
@@ -62,6 +63,24 @@ define amdgpu_kernel void @trunc_f16(
; GFX11-NEXT: v_trunc_f16_e32 v0, v0
; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: trunc_f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: buffer_load_u16 v0, off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_trunc_f16_e32 v0, v0
+; GFX12-NEXT: buffer_store_b16 v0, off, s[4:7], null
+; GFX12-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -147,6 +166,28 @@ define amdgpu_kernel void @trunc_v2f16(
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: trunc_v2f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s2
+; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-NEXT: v_trunc_f16_e32 v0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_trunc_f16_e32 v1, v1
+; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 33007e5b285d..3be17f9538d0 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1333,5 +1333,668 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
ret i48 %a
}
+define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 {
+; CI-LABEL: lshr_mad_i64_1:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_movk_i32 s4, 0xfc19
+; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT: v_mov_b32_e32 v0, v2
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_1:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_movk_i32 s4, 0xfc19
+; SI-NEXT: v_mul_hi_u32 v2, v1, s4
+; SI-NEXT: v_mul_lo_u32 v3, v1, s4
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0xfc19
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 %lsh, s0xfffffffffffffc19
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_2:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_movk_i32 s4, 0xd1
+; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT: v_mov_b32_e32 v0, v2
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_2:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_movk_i32 s4, 0xd1
+; SI-NEXT: v_mul_hi_u32 v2, v1, s4
+; SI-NEXT: v_mul_lo_u32 v3, v1, s4
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0xd1
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 %lsh, s0xffffffff000000d1
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_3(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_3:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_movk_i32 s4, 0xfc88
+; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT: v_mov_b32_e32 v0, v2
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_3:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_movk_i32 s4, 0xfc88
+; SI-NEXT: v_mul_hi_u32 v2, v1, s4
+; SI-NEXT: v_mul_lo_u32 v3, v1, s4
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0xfc88
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_3:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 s0xfffffffffffffc88, %lsh
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 {
+; CI-LABEL: lshr_mad_i64_4:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_mul_lo_u32 v3, v2, v0
+; CI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v1, v0, 0
+; CI-NEXT: s_movk_i32 s4, 0xfc88
+; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[1:2]
+; CI-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_4:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mul_lo_u32 v2, v2, v0
+; SI-NEXT: v_mul_hi_u32 v3, v1, v0
+; SI-NEXT: s_movk_i32 s4, 0xfc88
+; SI-NEXT: v_mul_lo_u32 v0, v1, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; SI-NEXT: v_mul_hi_u32 v3, v2, s4
+; SI-NEXT: v_mul_lo_u32 v1, v2, s4
+; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
+; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v0, v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-NEXT: s_movk_i32 s4, 0xfc88
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[4:5]
+; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_4:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_4:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v2, v0, v[1:2]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ext = zext i32 %arg0 to i64
+ %mul1 = mul i64 %arg1, %ext
+ %lsh = lshr i64 %mul1, 32
+ %mul2 = mul i64 %lsh, s0xfffffffffffffc88
+ %mad = add i64 %mul2, %mul1
+ ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_1(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_1:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_lshrrev_b32_e32 v2, 4, v1
+; CI-NEXT: s_movk_i32 s4, 0xfc19
+; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_1:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 4, v1
+; SI-NEXT: s_movk_i32 s4, 0xfc19
+; SI-NEXT: v_mul_lo_u32 v3, v2, s4
+; SI-NEXT: v_mul_hi_i32 v2, v2, s4
+; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 4, v1
+; GFX9-NEXT: s_movk_i32 s4, 0xfc19
+; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64_negative_1:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_lshrrev_b32_e32 v4, 4, v1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_mad_i64_i32 v[2:3], null, 0xfffffc19, v4, v[0:1]
+; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64_negative_1:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 4, v1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
+; GFX1150-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 4, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr i64 %arg0, 36
+ %mul = mul i64 %lsh, s0xfffffffffffffc19
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_2:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_movk_i32 s4, 0xd1
+; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
+; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v0
+; CI-NEXT: v_mov_b32_e32 v0, v2
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_2:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_movk_i32 s4, 0xd1
+; SI-NEXT: v_mul_hi_u32 v2, v1, s4
+; SI-NEXT: v_mul_lo_u32 v4, v1, s4
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v1
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
+; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0xd1
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1
+; GFX9-NEXT: v_sub_u32_e32 v1, v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_negative_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 8, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 %lsh, s0xffffff00000000d1
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_3:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_lshr_b64 v[2:3], v[0:1], 22
+; CI-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
+; CI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; CI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_3:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 22
+; SI-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
+; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX9-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_negative_3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_3:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %op = add i64 %arg0, 1
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 %lsh, s0xfffffffffffffc00
+ %mad = add i64 %mul, %op
+
+ ret i64 %mad
+}
+
+define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_4:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
+; CI-NEXT: v_mul_lo_u32 v0, v1, v1
+; CI-NEXT: v_add_i32_e32 v1, vcc, v0, v3
+; CI-NEXT: v_mov_b32_e32 v0, v2
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_4:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mul_hi_u32 v2, v1, v0
+; SI-NEXT: v_mul_lo_u32 v3, v1, v1
+; SI-NEXT: v_mul_lo_u32 v4, v1, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v1, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64_negative_4:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1]
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64_negative_4:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_mov_b32_e32 v0, v4
+; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1150-NEXT: v_mov_b32_e32 v0, v3
+; GFX1150-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_4:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 %lsh, %arg0
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
+define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_sgpr:
+; CI: ; %bb.0:
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v2, 0xffff1c18
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s1, v2, v[0:1]
+; CI-NEXT: v_subrev_i32_e32 v1, vcc, s1, v1
+; CI-NEXT: v_readfirstlane_b32 s0, v0
+; CI-NEXT: v_readfirstlane_b32 s1, v1
+; CI-NEXT: ; return to shader part epilog
+;
+; SI-LABEL: lshr_mad_i64_sgpr:
+; SI: ; %bb.0:
+; SI-NEXT: v_mov_b32_e32 v0, 0xffff1c18
+; SI-NEXT: v_mul_hi_u32 v0, s1, v0
+; SI-NEXT: s_mul_i32 s2, s1, 0xffff1c18
+; SI-NEXT: v_readfirstlane_b32 s3, v0
+; SI-NEXT: s_sub_i32 s3, s3, s1
+; SI-NEXT: s_add_u32 s0, s2, s0
+; SI-NEXT: s_addc_u32 s1, s3, s1
+; SI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: lshr_mad_i64_sgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mul_hi_u32 s2, s1, 0xffff1c18
+; GFX9-NEXT: s_sub_i32 s2, s2, s1
+; GFX9-NEXT: s_mul_i32 s3, s1, 0xffff1c18
+; GFX9-NEXT: s_add_u32 s0, s3, s0
+; GFX9-NEXT: s_addc_u32 s1, s2, s1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: lshr_mad_i64_sgpr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mul_hi_u32 s2, s1, 0xffff1c18
+; GFX11-NEXT: s_mul_i32 s3, s1, 0xffff1c18
+; GFX11-NEXT: s_sub_i32 s2, s2, s1
+; GFX11-NEXT: s_add_u32 s0, s3, s0
+; GFX11-NEXT: s_addc_u32 s1, s2, s1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: lshr_mad_i64_sgpr:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mov_b32 s4, 0xffff1c18
+; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_mov_b32 s5, -1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT: ; return to shader part epilog
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 %lsh, s0xffffffffffff1c18
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
+define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_vec:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_mov_b32 s4, 0xffff1c18
+; CI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
+; CI-NEXT: s_mov_b32 s4, 0xffff1118
+; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; CI-NEXT: v_sub_i32_e32 v1, vcc, v5, v1
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v7, v3
+; CI-NEXT: v_mov_b32_e32 v0, v4
+; CI-NEXT: v_mov_b32_e32 v2, v6
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_vec:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, 0xffff1118
+; SI-NEXT: v_mul_lo_u32 v4, v3, s4
+; SI-NEXT: v_mul_hi_u32 v5, v3, s4
+; SI-NEXT: s_mov_b32 s4, 0xffff1c18
+; SI-NEXT: v_mul_hi_u32 v6, v1, s4
+; SI-NEXT: v_mul_lo_u32 v7, v1, s4
+; SI-NEXT: v_sub_i32_e32 v5, vcc, v5, v3
+; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_vec:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0xffff1c18
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT: s_mov_b32 s4, 0xffff1118
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1
+; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_vec:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, v7, v3
+; GFX11-NEXT: v_mov_b32_e32 v2, v6
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_vec:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1
+; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3
+; GFX12-NEXT: v_mov_b32_e32 v2, v6
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr <2 x i64> %arg0, <i64 32, i64 32>
+ %mul = mul <2 x i64> %lsh, <i64 s0xffffffffffff1c18, i64 s0xffffffffffff1118>
+ %mad = add <2 x i64> %mul, %arg0
+
+ ret <2 x i64> %mad
+}
+
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 5e46fd6b28d2..fa15a42aef2a 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -1838,11 +1838,11 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maximumnum_v3f16:
@@ -1904,8 +1904,8 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maximumnum_v3f16_nnan:
@@ -1947,20 +1947,20 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
; GFX8-LABEL: v_maximumnum_v4f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maximumnum_v4f16:
@@ -2020,12 +2020,12 @@ define <4 x half> @v_maximumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
; GFX8-LABEL: v_maximumnum_v4f16_nnan:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_max_f16_sdwa v2, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maximumnum_v4f16_nnan:
@@ -2067,27 +2067,27 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
; GFX8-LABEL: v_maximumnum_v6f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v6, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v7, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v5
; GFX8-NEXT: v_max_f16_e32 v1, v1, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v4
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maximumnum_v6f16:
@@ -2159,34 +2159,34 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
; GFX8-LABEL: v_maximumnum_v8f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v8, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v6
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v7
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v6
; GFX8-NEXT: v_max_f16_e32 v1, v1, v5
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v5
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT: v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v5, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v9
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v8
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maximumnum_v8f16:
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 9e0b7daf38de..f5fb85d63b8e 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -1792,11 +1792,11 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_min_f16_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_min_f16_e32 v1, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minimumnum_v3f16:
@@ -1858,8 +1858,8 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_min_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minimumnum_v3f16_nnan:
@@ -1901,20 +1901,20 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
; GFX8-LABEL: v_minimumnum_v4f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_min_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_min_f16_e32 v1, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minimumnum_v4f16:
@@ -1974,12 +1974,12 @@ define <4 x half> @v_minimumnum_v4f16_nnan(<4 x half> %x, <4 x half> %y) {
; GFX8-LABEL: v_minimumnum_v4f16_nnan:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v2
-; GFX8-NEXT: v_min_f16_sdwa v2, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_min_f16_e32 v1, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minimumnum_v4f16_nnan:
@@ -2021,27 +2021,27 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
; GFX8-LABEL: v_minimumnum_v6f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v6, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v7, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_min_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v3
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_min_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v5
; GFX8-NEXT: v_min_f16_e32 v1, v1, v4
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_max_f16_sdwa v3, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v4, v5, v5
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT: v_min_f16_e32 v2, v2, v4
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v7
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minimumnum_v6f16:
@@ -2113,34 +2113,34 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
; GFX8-LABEL: v_minimumnum_v8f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v8, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v6
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_min_f16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_min_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v3, v3, v7
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v6
; GFX8-NEXT: v_min_f16_e32 v1, v1, v5
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT: v_min_f16_e32 v2, v2, v5
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT: v_max_f16_sdwa v4, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v5, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_min_f16_e32 v3, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v11
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v10
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v9
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v8
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_minimumnum_v8f16:
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 73f3d4c037ad..774a22fb907d 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12 %s
define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_minmax_i32:
@@ -8,6 +10,16 @@ define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_maxmin_i32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maxmin_i32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
%sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
ret i32 %sminmax
@@ -45,6 +57,16 @@ define i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_maxmin_i32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_commuted_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maxmin_i32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
%sminmax = call i32 @llvm.smin.i32(i32 %c, i32 %smax)
ret i32 %sminmax
@@ -56,6 +78,16 @@ define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_minmax_i32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minmax_i32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
%smaxmin = call i32 @llvm.smax.i32(i32 %smin, i32 %c)
ret i32 %smaxmin
@@ -67,6 +99,16 @@ define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_minmax_i32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_commuted_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minmax_i32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
%smaxmin = call i32 @llvm.smax.i32(i32 %c, i32 %smin)
ret i32 %smaxmin
@@ -79,6 +121,17 @@ define void @test_smed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
; GFX11-NEXT: v_med3_i32 v2, v2, v3, v4
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_smed3_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_med3_i32 v2, v2, v3, v4
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call i32 @llvm.smin.i32(i32 %x, i32 %y)
%tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y)
%tmp2 = call i32 @llvm.smin.i32(i32 %tmp1, i32 %z)
@@ -93,6 +146,16 @@ define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_maxmin_u32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_u32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maxmin_u32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
%uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c)
ret i32 %uminmax
@@ -130,6 +193,16 @@ define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_maxmin_u32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_commuted_u32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maxmin_u32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
%uminmax = call i32 @llvm.umin.i32(i32 %c, i32 %umax)
ret i32 %uminmax
@@ -141,6 +214,16 @@ define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_minmax_u32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_u32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minmax_u32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
%umaxmin = call i32 @llvm.umax.i32(i32 %umin, i32 %c)
ret i32 %umaxmin
@@ -152,6 +235,16 @@ define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_minmax_u32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_commuted_u32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minmax_u32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
%umaxmin = call i32 @llvm.umax.i32(i32 %c, i32 %umin)
ret i32 %umaxmin
@@ -164,6 +257,17 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
; GFX11-NEXT: v_med3_u32 v2, v2, v3, v4
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_umed3_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_med3_u32 v2, v2, v3, v4
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call i32 @llvm.umin.i32(i32 %x, i32 %y)
%tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y)
%tmp2 = call i32 @llvm.umin.i32(i32 %tmp1, i32 %z)
@@ -173,44 +277,88 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
}
define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) {
-; SDAG-LABEL: test_minmax_f32_ieee_true:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; SDAG-NEXT: v_max_f32_e32 v2, v2, v2
-; SDAG-NEXT: v_maxmin_f32 v0, v0, v1, v2
-; SDAG-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_minmax_f32_ieee_true:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; SDAG-GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_minmax_f32_ieee_true:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GISEL-NEXT: v_max_f32_e32 v2, v2, v2
-; GISEL-NEXT: v_maxmin_f32 v0, v0, v1, v2
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GISEL-GFX11-LABEL: test_minmax_f32_ieee_true:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GISEL-GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_minmax_f32_ieee_true:
+; SDAG-GFX12: ; %bb.0:
+; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
+; SDAG-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2
+; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_minmax_f32_ieee_true:
+; GISEL-GFX12: ; %bb.0:
+; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GISEL-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2
+; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
%max = call float @llvm.maxnum.f32(float %a, float %b)
%minmax = call float @llvm.minnum.f32(float %max, float %c)
ret float %minmax
}
define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg %b, float inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_f32_ieee_false:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-NEXT: s_mov_b32 s5, s4
-; SDAG-NEXT: s_mov_b32 s4, s3
-; SDAG-NEXT: v_maxmin_f32 v0, s0, s1, v0
-; SDAG-NEXT: global_store_b32 v1, v0, s[4:5]
-; SDAG-NEXT: s_endpgm
+; SDAG-GFX11-LABEL: s_test_minmax_f32_ieee_false:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX11-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX11-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX11-NEXT: v_maxmin_f32 v0, s0, s1, v0
+; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; SDAG-GFX11-NEXT: s_endpgm
;
-; GISEL-LABEL: s_test_minmax_f32_ieee_false:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT: s_mov_b32 s6, s3
-; GISEL-NEXT: s_mov_b32 s7, s4
-; GISEL-NEXT: v_maxmin_f32 v0, s0, s1, v0
-; GISEL-NEXT: global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT: s_endpgm
+; GISEL-GFX11-LABEL: s_test_minmax_f32_ieee_false:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX11-NEXT: s_mov_b32 s7, s4
+; GISEL-GFX11-NEXT: v_maxmin_f32 v0, s0, s1, v0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX11-NEXT: s_endpgm
+;
+; SDAG-GFX12-LABEL: s_test_minmax_f32_ieee_false:
+; SDAG-GFX12: ; %bb.0:
+; SDAG-GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX12-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX12-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX12-NEXT: v_maxmin_num_f32 v0, s0, s1, v0
+; SDAG-GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
+; SDAG-GFX12-NEXT: s_endpgm
+;
+; GISEL-GFX12-LABEL: s_test_minmax_f32_ieee_false:
+; GISEL-GFX12: ; %bb.0:
+; GISEL-GFX12-NEXT: s_max_num_f32 s0, s0, s1
+; GISEL-GFX12-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX12-NEXT: s_mov_b32 s7, s4
+; GISEL-GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-GFX12-NEXT: s_min_num_f32 s0, s0, s2
+; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX12-NEXT: global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX12-NEXT: s_endpgm
%smax = call float @llvm.maxnum.f32(float %a, float %b)
%sminmax = call float @llvm.minnum.f32(float %smax, float %c)
store float %sminmax, ptr addrspace(1) %out
@@ -222,27 +370,56 @@ define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b,
; GFX11: ; %bb.0:
; GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_minmax_commuted_f32_ieee_false:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2
+; GFX12-NEXT: ; return to shader part epilog
%max = call float @llvm.maxnum.f32(float %a, float %b)
%minmax = call float @llvm.minnum.f32(float %c, float %max)
ret float %minmax
}
define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) {
-; SDAG-LABEL: test_maxmin_f32_ieee_true:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; SDAG-NEXT: v_max_f32_e32 v2, v2, v2
-; SDAG-NEXT: v_minmax_f32 v0, v0, v1, v2
-; SDAG-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_maxmin_f32_ieee_true:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; SDAG-GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_maxmin_f32_ieee_true:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GISEL-NEXT: v_max_f32_e32 v2, v2, v2
-; GISEL-NEXT: v_minmax_f32 v0, v0, v1, v2
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GISEL-GFX11-LABEL: test_maxmin_f32_ieee_true:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GISEL-GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_maxmin_f32_ieee_true:
+; SDAG-GFX12: ; %bb.0:
+; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
+; SDAG-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2
+; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_maxmin_f32_ieee_true:
+; GISEL-GFX12: ; %bb.0:
+; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GISEL-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2
+; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.minnum.f32(float %a, float %b)
%maxmin = call float @llvm.maxnum.f32(float %min, float %c)
ret float %maxmin
@@ -253,6 +430,11 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b,
; GFX11: ; %bb.0:
; GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_maxmin_commuted_f32_ieee_false:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2
+; GFX12-NEXT: ; return to shader part epilog
%min = call float @llvm.minnum.f32(float %a, float %b)
%maxmin = call float @llvm.maxnum.f32(float %c, float %min)
ret float %maxmin
@@ -265,6 +447,17 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
; GFX11-NEXT: v_med3_f32 v2, v2, v3, v4
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_med3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call float @llvm.minnum.f32(float %x, float %y)
%tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
%tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
@@ -278,29 +471,54 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_minmax_f16_ieee_false:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; GFX12-NEXT: ; return to shader part epilog
%max = call half @llvm.maxnum.f16(half %a, half %b)
%minmax = call half @llvm.minnum.f16(half %max, half %c)
ret half %minmax
}
define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_f16_ieee_false:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-NEXT: s_mov_b32 s5, s4
-; SDAG-NEXT: s_mov_b32 s4, s3
-; SDAG-NEXT: v_maxmin_f16 v0, s0, s1, v0
-; SDAG-NEXT: global_store_b16 v1, v0, s[4:5]
-; SDAG-NEXT: s_endpgm
+; SDAG-GFX11-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX11-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX11-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0
+; SDAG-GFX11-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX11-NEXT: s_endpgm
;
-; GISEL-LABEL: s_test_minmax_f16_ieee_false:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT: s_mov_b32 s6, s3
-; GISEL-NEXT: s_mov_b32 s7, s4
-; GISEL-NEXT: v_maxmin_f16 v0, s0, s1, v0
-; GISEL-NEXT: global_store_b16 v1, v0, s[6:7]
-; GISEL-NEXT: s_endpgm
+; GISEL-GFX11-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX11-NEXT: s_mov_b32 s7, s4
+; GISEL-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0
+; GISEL-GFX11-NEXT: global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX11-NEXT: s_endpgm
+;
+; SDAG-GFX12-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX12: ; %bb.0:
+; SDAG-GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX12-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX12-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, s0, s1, v0
+; SDAG-GFX12-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX12-NEXT: s_endpgm
+;
+; GISEL-GFX12-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX12: ; %bb.0:
+; GISEL-GFX12-NEXT: s_max_num_f16 s0, s0, s1
+; GISEL-GFX12-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX12-NEXT: s_mov_b32 s7, s4
+; GISEL-GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-GFX12-NEXT: s_min_num_f16 s0, s0, s2
+; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX12-NEXT: global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX12-NEXT: s_endpgm
%smax = call half @llvm.maxnum.f16(half %a, half %b)
%sminmax = call half @llvm.minnum.f16(half %smax, half %c)
store half %sminmax, ptr addrspace(1) %out
@@ -308,23 +526,49 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
}
define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-LABEL: test_minmax_commuted_f16_ieee_true:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; SDAG-NEXT: v_maxmin_f16 v0, v0, v1, v2
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_minmax_commuted_f16_ieee_true:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_max_f16_e32 v0, v0, v0
-; GISEL-NEXT: v_max_f16_e32 v1, v1, v1
-; GISEL-NEXT: v_max_f16_e32 v2, v2, v2
-; GISEL-NEXT: v_maxmin_f16 v0, v0, v1, v2
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX12: ; %bb.0:
+; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX12: ; %bb.0:
+; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half %a, half %b)
%minmax = call half @llvm.minnum.f16(half %c, half %max)
ret half %minmax
@@ -335,29 +579,60 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_maxmin_f16_ieee_false:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; GFX12-NEXT: ; return to shader part epilog
%min = call half @llvm.minnum.f16(half %a, half %b)
%maxmin = call half @llvm.maxnum.f16(half %min, half %c)
ret half %maxmin
}
define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-LABEL: test_maxmin_commuted_f16_ieee_true:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; SDAG-NEXT: v_minmax_f16 v0, v0, v1, v2
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_maxmin_commuted_f16_ieee_true:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_max_f16_e32 v0, v0, v0
-; GISEL-NEXT: v_max_f16_e32 v1, v1, v1
-; GISEL-NEXT: v_max_f16_e32 v2, v2, v2
-; GISEL-NEXT: v_minmax_f16 v0, v0, v1, v2
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX12: ; %bb.0:
+; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX12: ; %bb.0:
+; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half %a, half %b)
%maxmin = call half @llvm.maxnum.f16(half %c, half %min)
ret half %maxmin
@@ -370,6 +645,17 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
; GFX11-NEXT: v_med3_f16 v2, v2, v3, v4
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_med3_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_med3_num_f16 v2, v2, v3, v4
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call half @llvm.minnum.f16(half %x, half %y)
%tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
%tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
new file mode 100644
index 000000000000..a9b8663a48de
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 %s -o - | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 %s -o - | FileCheck %s --check-prefixes=GFX908
+
+define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
+; GFX942-LABEL: matmul_kernel:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_mov_b32 s2, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX942-NEXT: s_mov_b32 s3, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX942-NEXT: s_branch .LBB0_2
+; GFX942-NEXT: .LBB0_1: ; %bb2
+; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; GFX942-NEXT: s_or_b32 s4, s3, 1
+; GFX942-NEXT: s_ashr_i32 s5, s3, 31
+; GFX942-NEXT: s_mov_b32 s3, s2
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT: s_and_b32 s3, s5, s4
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3]
+; GFX942-NEXT: s_cbranch_execz .LBB0_4
+; GFX942-NEXT: .LBB0_2: ; %bb
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX942-NEXT: s_cbranch_vccz .LBB0_1
+; GFX942-NEXT: ; %bb.3:
+; GFX942-NEXT: ; implicit-def: $sgpr3
+; GFX942-NEXT: .LBB0_4: ; %common.ret
+; GFX942-NEXT: s_endpgm
+;
+; GFX908-LABEL: matmul_kernel:
+; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
+; GFX908-NEXT: s_mov_b32 s2, 0
+; GFX908-NEXT: s_mov_b32 s3, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX908-NEXT: s_branch .LBB0_2
+; GFX908-NEXT: .LBB0_1: ; %bb2
+; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; GFX908-NEXT: s_or_b32 s4, s3, 1
+; GFX908-NEXT: s_ashr_i32 s5, s3, 31
+; GFX908-NEXT: s_mov_b32 s3, s2
+; GFX908-NEXT: s_nop 3
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_mov_b32_e32 v5, s3
+; GFX908-NEXT: v_mov_b32_e32 v4, s2
+; GFX908-NEXT: v_mov_b32_e32 v2, v1
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: s_and_b32 s3, s5, s4
+; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[4:5], v[4:5], a[0:3]
+; GFX908-NEXT: s_cbranch_execz .LBB0_4
+; GFX908-NEXT: .LBB0_2: ; %bb
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX908-NEXT: s_cbranch_vccz .LBB0_1
+; GFX908-NEXT: ; %bb.3:
+; GFX908-NEXT: ; implicit-def: $sgpr3
+; GFX908-NEXT: .LBB0_4: ; %common.ret
+; GFX908-NEXT: s_endpgm
+entry:
+ br label %bb
+
+bb:
+ %i = phi { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } [ %i10, %bb2 ], [ zeroinitializer, %entry ]
+ %i1 = phi i32 [ %i5, %bb2 ], [ 0, %entry ]
+ %c0 = icmp ne i32 %a0, 0
+ br i1 %c0, label %bb2, label %bb11
+
+bb2:
+ %i3 = or i32 %i1, 1
+ %i4 = icmp slt i32 %i1, 0
+ %i5 = select i1 %i4, i32 %i3, i32 0
+ %i6 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 123
+ %i7 = insertelement <4 x float> zeroinitializer, float %i6, i32 0
+ %i8 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i7, i32 0, i32 0, i32 0)
+ %i9 = extractelement <4 x float> %i8, i32 0
+ %i10 = insertvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } zeroinitializer, float %i9, 123
+ br label %bb
+
+bb11:
+ %c1 = icmp ne i32 %a1, 0
+ br i1 %c1, label %bb12, label %common.ret
+
+common.ret:
+ ret void
+
+bb12:
+ %i13 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 0
+ %i14 = insertelement <4 x float> zeroinitializer, float %i13, i32 0
+ %i15 = insertelement <4 x float> %i14, float 0.000000e+00, i32 0
+ %i16 = insertelement <4 x float> %i15, float 0.000000e+00, i32 0
+ br label %common.ret
+}
+
+; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
new file mode 100644
index 000000000000..5c83170563e5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
@@ -0,0 +1,235 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE
+
+...
+---
+name: test
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: test
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; CHECK-NEXT: S_BITCMP1_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B32_e32_]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+ ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_]], 1, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:agpr_32 = PHI [[COPY1]], %bb.0, %24, %bb.3
+ ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %11, %bb.3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+ ; CHECK-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI1]], 1, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[PHI1]], 31, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_ASHR_I32_]], killed [[S_OR_B32_]], implicit-def dead $scc
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:areg_128_align2 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY4]], [[COPY4]], killed [[COPY5]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, [[S_AND_B32_]], %bb.2
+ ; CHECK-NEXT: [[PHI3:%[0-9]+]]:agpr_32 = PHI [[COPY3]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_]].sub0, %bb.2
+ ; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.1, [[S_MOV_B64_1]], %bb.2
+ ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI4]], implicit $exec
+ ; CHECK-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_1]], 1, implicit $exec
+ ; CHECK-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: successors: %bb.5(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5:
+ ; CHECK-NEXT: S_ENDPGM 0
+ ;
+ ; COALESCE-LABEL: name: test
+ ; COALESCE: bb.0:
+ ; COALESCE-NEXT: successors: %bb.1(0x80000000)
+ ; COALESCE-NEXT: liveins: $sgpr4_sgpr5
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; COALESCE-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+ ; COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0
+ ; COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc
+ ; COALESCE-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec
+ ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+ ; COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+ ; COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+ ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: bb.1:
+ ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+ ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+ ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+ ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
+ ; COALESCE-NEXT: S_BRANCH %bb.2
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: bb.2:
+ ; COALESCE-NEXT: successors: %bb.3(0x80000000)
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc
+ ; COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
+ ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
+ ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+ ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+ ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
+ ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+ ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]]
+ ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: bb.3:
+ ; COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
+ ; COALESCE-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec
+ ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
+ ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ ; COALESCE-NEXT: S_BRANCH %bb.4
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: bb.4:
+ ; COALESCE-NEXT: successors: %bb.5(0x80000000)
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: bb.5:
+ ; COALESCE-NEXT: S_ENDPGM 0
+ ;
+ ; GFX908-COALESCE-LABEL: name: test
+ ; GFX908-COALESCE: bb.0:
+ ; GFX908-COALESCE-NEXT: successors: %bb.1(0x80000000)
+ ; GFX908-COALESCE-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GFX908-COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; GFX908-COALESCE-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+ ; GFX908-COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0
+ ; GFX908-COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc
+ ; GFX908-COALESCE-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+ ; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+ ; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+ ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: bb.1:
+ ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+ ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+ ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+ ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
+ ; GFX908-COALESCE-NEXT: S_BRANCH %bb.2
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: bb.2:
+ ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000)
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc
+ ; GFX908-COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
+ ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
+ ; GFX908-COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+ ; GFX908-COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+ ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
+ ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+ ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]]
+ ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: bb.3:
+ ; GFX908-COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
+ ; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec
+ ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
+ ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ ; GFX908-COALESCE-NEXT: S_BRANCH %bb.4
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: bb.4:
+ ; GFX908-COALESCE-NEXT: successors: %bb.5(0x80000000)
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: bb.5:
+ ; GFX908-COALESCE-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+ liveins: $sgpr4_sgpr5
+
+ %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ S_BITCMP1_B32 killed %1, 0, implicit-def $scc
+ %2:sgpr_32 = S_MOV_B32 0
+ %3:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_32 = IMPLICIT_DEF
+ %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %3, implicit $exec
+ %7:sreg_64_xexec = V_CMP_NE_U32_e64 %6, 1, implicit $exec
+
+ bb.1:
+ successors: %bb.2, %bb.3
+
+ %8:vgpr_32 = PHI %4, %bb.0, %9, %bb.3
+ %10:sreg_32 = PHI %2, %bb.0, %11, %bb.3
+ %12:agpr_32 = COPY %8
+ %13:sreg_64 = S_MOV_B64 -1
+ $vcc = S_AND_B64 $exec, %7, implicit-def $scc
+ S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.3
+
+ %14:sreg_32 = S_OR_B32 %10, 1, implicit-def dead $scc
+ %15:sreg_32 = S_ASHR_I32 %10, 31, implicit-def dead $scc
+ %16:sreg_32 = S_AND_B32 killed %15, killed %14, implicit-def dead $scc
+ %17:vreg_128_align2 = REG_SEQUENCE %8, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3
+ %18:sreg_64 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1
+ %19:vreg_64_align2 = COPY %18
+ %20:areg_128_align2 = COPY %17
+ %21:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %19, %19, killed %20, 0, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = COPY %21.sub0
+ %23:sreg_64 = S_MOV_B64 0
+
+ bb.3:
+ successors: %bb.4, %bb.1
+
+ %11:sreg_32 = PHI %5, %bb.1, %16, %bb.2
+ %24:agpr_32 = PHI %12, %bb.1, %21.sub0, %bb.2
+ %25:sreg_64_xexec = PHI %13, %bb.1, %23, %bb.2
+ %9:vgpr_32 = COPY %24
+ %26:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %25, implicit $exec
+ %27:sreg_64_xexec = V_CMP_NE_U32_e64 %26, 1, implicit $exec
+ $vcc = S_AND_B64 $exec, %27, implicit-def $scc
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.4
+
+ bb.4:
+ successors: %bb.5
+
+ bb.5:
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
new file mode 100644
index 000000000000..49c0aaf9fb39
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
@@ -0,0 +1,182 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE
+
+...
+---
+name: test
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: test
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; CHECK-NEXT: S_BITCMP0_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+ ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+ ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+ ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_4:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_4]], %subreg.sub3
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_3:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MFMA_F32_16X16X16F16_e64_3]].sub0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:agpr_32 = PHI [[V_ACCVGPR_WRITE_B32_e64_]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_3]].sub0, %bb.2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+ ; CHECK-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_PACK_B32_F16_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+ ; CHECK-NEXT: S_ENDPGM 0
+ ;
+ ; COALESCE-LABEL: name: test
+ ; COALESCE: bb.0:
+ ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; COALESCE-NEXT: liveins: $sgpr4_sgpr5
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
+ ; COALESCE-NEXT: S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+ ; COALESCE-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: bb.1:
+ ; COALESCE-NEXT: successors: %bb.3(0x80000000)
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+ ; COALESCE-NEXT: S_BRANCH %bb.3
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: bb.2:
+ ; COALESCE-NEXT: successors: %bb.3(0x80000000)
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+ ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: bb.3:
+ ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+ ; COALESCE-NEXT: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec
+ ; COALESCE-NEXT: undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
+ ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; COALESCE-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+ ; COALESCE-NEXT: S_ENDPGM 0
+ ;
+ ; GFX908-COALESCE-LABEL: name: test
+ ; GFX908-COALESCE: bb.0:
+ ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX908-COALESCE-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GFX908-COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; GFX908-COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
+ ; GFX908-COALESCE-NEXT: S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+ ; GFX908-COALESCE-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: bb.1:
+ ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000)
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+ ; GFX908-COALESCE-NEXT: S_BRANCH %bb.3
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: bb.2:
+ ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000)
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+ ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+ ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+ ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+ ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+ ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: bb.3:
+ ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+ ; GFX908-COALESCE-NEXT: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec
+ ; GFX908-COALESCE-NEXT: undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; GFX908-COALESCE-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+ ; GFX908-COALESCE-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.2, %bb.1
+ liveins: $sgpr4_sgpr5
+
+ %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ %2:sgpr_32 = S_MOV_B32 0
+ S_BITCMP0_B32 killed %1, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+
+ bb.1:
+ successors: %bb.3
+
+ %3:sgpr_32 = COPY %2
+ %4:vgpr_32 = COPY %3, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ successors: %bb.3
+
+ %5:sgpr_32 = S_MOV_B32 0
+ %6:vgpr_32 = COPY %5
+ %7:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+ %8:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+ %9:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+ %10:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+ %11:areg_128_align2 = REG_SEQUENCE %7, %subreg.sub0, %8, %subreg.sub1, %9, %subreg.sub2, %10, %subreg.sub3
+ %12:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %5, %subreg.sub1
+ %13:vreg_64_align2 = COPY %12
+ %14:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %11, 0, 0, 0, implicit $mode, implicit $exec
+ %15:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %14, 0, 0, 0, implicit $mode, implicit $exec
+ %16:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %15, 0, 0, 0, implicit $mode, implicit $exec
+ %17:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %16, 0, 0, 0, implicit $mode, implicit $exec
+ %18:vgpr_32 = COPY %17.sub0
+ %19:vgpr_32 = COPY %18
+
+ bb.3:
+ %20:vgpr_32 = PHI %4, %bb.1, %19, %bb.2
+ %21:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, %20, 0, 0, implicit $mode, implicit $exec
+ %22:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, %2, 0, 0, implicit $mode, implicit $exec
+ %23:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %24:vreg_64_align2 = REG_SEQUENCE %22, %subreg.sub0, killed %23, %subreg.sub1
+ %25:sgpr_128 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1, %2, %subreg.sub2, %2, %subreg.sub3
+ %26:vreg_64_align2 = COPY %24
+ BUFFER_STORE_DWORDX2_OFFSET_exact killed %26, killed %25, %2, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 98d5f3097153..a2a0107a6f7d 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -1372,20 +1372,19 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; GFX8-NEXT: s_movk_i32 s0, 0xf000
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3
+; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
; GFX8-NEXT: s_movk_i32 s0, 0xf800
-; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[3:4]
-; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6]
+; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[3:4]
+; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8]
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc
; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10]
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0, v3
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 1, v4, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v4
; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v7
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v8, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v5
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc
@@ -1416,32 +1415,32 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff8000, v1
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff8000, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s35
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v12
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v10
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 1, v1, vcc
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-4096
; GFX9-NEXT: s_movk_i32 s0, 0xf000
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off
-; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[34:35]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: Offset64:
@@ -1477,8 +1476,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx2 v[10:11], v[0:1], off
@@ -1517,25 +1515,25 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo
-; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0xfffff000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_load_b64 v[6:7], v[4:5], off offset:-4096
-; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
+; GFX11-NEXT: global_load_b64 v[4:5], v[2:3], off
+; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:2048
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35]
; GFX11-NEXT: s_endpgm
entry:
@@ -2408,18 +2406,17 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX8-NEXT: v_mov_b32_e32 v3, 3
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0
+; GFX8-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GFX8-NEXT: s_movk_i32 s0, 0x800
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v6, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v6, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v0, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v0
; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6]
+; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
; GFX8-NEXT: s_endpgm
;
@@ -2450,14 +2447,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX9-NEXT: v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:-2048
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
@@ -2490,15 +2486,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v3
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
@@ -2525,19 +2520,18 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v4
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:-2048
-; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:-2048
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35]
; GFX11-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index ba428df273db..a439f8df10a2 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -3,32 +3,17 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
define amdgpu_cs float @v_s_exp_f32(float inreg %src) {
-; GFX12-SDAG-LABEL: v_s_exp_f32:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
-; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42800000, 0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-SDAG-NEXT: s_add_f32 s0, s0, s1
-; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0xffffffc0, 0
-; GFX12-SDAG-NEXT: v_s_exp_f32 s0, s0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
-; GFX12-SDAG-NEXT: v_ldexp_f32 v0, s0, s1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: v_s_exp_f32:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
-; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42800000, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT: s_add_f32 s0, s0, s1
-; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0
-; GFX12-GISEL-NEXT: v_s_exp_f32 s0, s0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX12-LABEL: v_s_exp_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
+; GFX12-NEXT: s_cselect_b32 s1, 0x42800000, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-NEXT: s_add_f32 s0, s0, s1
+; GFX12-NEXT: s_cselect_b32 s1, 0xffffffc0, 0
+; GFX12-NEXT: v_s_exp_f32 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-NEXT: v_ldexp_f32 v0, s0, s1
+; GFX12-NEXT: ; return to shader part epilog
%result = call float @llvm.exp2.f32(float %src)
ret float %result
}
@@ -88,16 +73,16 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) {
; GFX12-GISEL-LABEL: v_s_log_f32:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000
-; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_lshl_b32 s2, s1, 5
+; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT: v_ldexp_f32 v0, s0, s2
+; GFX12-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0
+; GFX12-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: v_subrev_f32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%result = call float @llvm.log2.f32(float %src)
ret float %result
@@ -322,19 +307,18 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
;
; GFX12-GISEL-LABEL: srcmods_abs_f32:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_bitset0_b32 s0, 31
+; GFX12-GISEL-NEXT: s_and_b32 s1, s0, 0x7fffffff
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000
-; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT: s_cmp_lt_f32 s1, 0x800000
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT: s_lshl_b32 s2, s1, 5
+; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT: v_ldexp_f32 v0, |s0|, s2
+; GFX12-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: v_subrev_f32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%abs = call float @llvm.fabs.f32(float %src)
%result = call float @llvm.log2.f32(float %abs)
@@ -362,19 +346,18 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
;
; GFX12-GISEL-LABEL: srcmods_neg_f32:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GFX12-GISEL-NEXT: s_xor_b32 s1, s0, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000
-; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1
-; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT: s_cmp_lt_f32 s1, 0x800000
+; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT: s_lshl_b32 s2, s1, 5
+; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT: v_ldexp_f32 v0, -s0, s2
+; GFX12-GISEL-NEXT: s_cselect_b32 s0, 0x42000000, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: v_subrev_f32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%neg = fneg float %src
%result = call float @llvm.log2.f32(float %neg)
diff --git a/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir b/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir
new file mode 100644
index 000000000000..3879f6dccf9d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir
@@ -0,0 +1,85 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=register-coalescer -o - %s | FileCheck %s
+
+# This used to assert due to trying to rematerialize V_MOV_B64_PSEUDO
+# at copy to $vgpr1. This would assert since this would clobber the
+# live value in $vgpr0.
+
+---
+name: rematerialize_physreg_sub_def_already_live_at_def_assert
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: rematerialize_physreg_sub_def_already_live_at_def_assert
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = COPY [[V_MOV_B]].sub1
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+ %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+ %1:vgpr_32 = COPY %0.sub1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr1 = COPY %1
+ SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+...
+
+# Same as previous, except with an IMPLICIT_DEF
+---
+name: rematerialize_physreg_sub_def_already_live_at_def_assert_implicit_def
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: rematerialize_physreg_sub_def_already_live_at_def_assert_implicit_def
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = COPY [[DEF]].sub1
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+ %0:vreg_64 = IMPLICIT_DEF
+ %1:vgpr_32 = COPY %0.sub1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr1 = COPY %1
+ SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+...
+
+---
+name: rematerialize_physreg_sub_def_no_live_sub_def_0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: rematerialize_physreg_sub_def_no_live_sub_def_0
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 1, implicit $exec, implicit-def $vgpr1
+ ; CHECK-NEXT: SI_RETURN implicit killed $vgpr1
+ %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+ %1:vgpr_32 = COPY %0.sub1
+ $vgpr1 = COPY %1
+ SI_RETURN implicit killed $vgpr1
+...
+
+---
+name: rematerialize_physreg_sub_def_no_live_sub_def_1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: rematerialize_physreg_sub_def_no_live_sub_def_1
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: dead $vgpr1_vgpr2 = V_MOV_B64_PSEUDO 1, implicit $exec, implicit-def $vgpr1
+ ; CHECK-NEXT: SI_RETURN implicit killed $vgpr1
+ %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+ %1:vgpr_32 = COPY %0.sub0
+ $vgpr1 = COPY %1
+ SI_RETURN implicit killed $vgpr1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 3e8768c98b5c..96dd6276f7e3 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1065,100 +1065,37 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-NEXT: s_endpgm
;
; GCN-IR-LABEL: s_test_sdiv24_48:
-; GCN-IR: ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GCN-IR-NEXT: s_mov_b32 s15, 0
-; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_sext_i32_i16 s1, s1
-; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 24
-; GCN-IR-NEXT: s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 24
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 16
-; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31
-; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT: s_mov_b32 s1, s0
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 16
-; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1]
-; GCN-IR-NEXT: s_mov_b32 s3, s2
-; GCN-IR-NEXT: s_sub_u32 s12, s6, s0
-; GCN-IR-NEXT: s_subb_u32 s13, s7, s0
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s6, s6, s2
-; GCN-IR-NEXT: s_subb_u32 s7, s7, s2
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0
-; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[6:7]
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[12:13]
-; GCN-IR-NEXT: s_sub_u32 s16, s14, s20
-; GCN-IR-NEXT: s_subb_u32 s17, 0, 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[16:17], 63
-; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[18:19]
-; GCN-IR-NEXT: s_and_b64 s[10:11], s[18:19], exec
-; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13
-; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12
-; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s18, s16, 1
-; GCN-IR-NEXT: s_addc_u32 s19, s17, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
-; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16
-; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s18
-; GCN-IR-NEXT: s_add_u32 s18, s6, -1
-; GCN-IR-NEXT: s_addc_u32 s19, s7, -1
-; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15]
-; GCN-IR-NEXT: s_add_u32 s12, s8, s20
-; GCN-IR-NEXT: s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT: s_mov_b32 s9, 0
-; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while
-; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1
-; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9]
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11]
-; GCN-IR-NEXT: s_sub_u32 s8, s18, s16
-; GCN-IR-NEXT: s_subb_u32 s8, s19, s17
-; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31
-; GCN-IR-NEXT: s_mov_b32 s15, s14
-; GCN-IR-NEXT: s_and_b32 s8, s14, 1
-; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s16, s16, s14
-; GCN-IR-NEXT: s_subb_u32 s17, s17, s15
-; GCN-IR-NEXT: s_add_u32 s12, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9]
-; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT: .LBB9_4: ; %Flow4
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7]
-; GCN-IR-NEXT: .LBB9_5: ; %udiv-end
-; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
-; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
-; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1]
-; GCN-IR-NEXT: s_sub_u32 s0, s2, s0
-; GCN-IR-NEXT: s_subb_u32 s1, s3, s1
+; GCN-IR: ; %bb.0:
+; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
; GCN-IR-NEXT: s_mov_b32 s6, -1
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
-; GCN-IR-NEXT: s_waitcnt expcnt(0)
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s0
+; GCN-IR-NEXT: s_mov_b32 s5, s1
+; GCN-IR-NEXT: s_sext_i32_i16 s1, s9
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT: v_alignbit_b32 v0, s1, v0, 24
+; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0
+; GCN-IR-NEXT: s_mov_b32 s4, s0
+; GCN-IR-NEXT: s_sext_i32_i16 s0, s3
+; GCN-IR-NEXT: v_mov_b32_e32 v2, s2
+; GCN-IR-NEXT: v_alignbit_b32 v2, s0, v2, 24
+; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v2
+; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 30, v0
+; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0
+; GCN-IR-NEXT: v_mul_f32_e32 v2, v3, v4
+; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2
+; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, v3
+; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1|
+; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; GCN-IR-NEXT: s_endpgm
%1 = ashr i48 %x, 24
%2 = ashr i48 %y, 24
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index e0d0ddce208c..ddf6297bc27a 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -1,6 +1,8 @@
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
@@ -98,6 +100,8 @@ declare i64 @llvm.smin.i64(i64, i64)
; VI: v_max_i16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}}
; VI: v_min_i16_e32 {{v[0-9]}}, 17, [[MAX]]
; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define amdgpu_kernel void @v_test_smed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -686,6 +690,8 @@ bb:
; VI: v_max_i16
; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_smed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -707,6 +713,8 @@ bb:
; GCN-LABEL: {{^}}v_test_smed3_i16_pat_1:
; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_smed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index cb8f82db92bb..23364e860d15 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1188,109 +1188,39 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-NEXT: s_endpgm
;
; GCN-IR-LABEL: s_test_srem24_48:
-; GCN-IR: ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GCN-IR-NEXT: s_mov_b32 s13, 0
+; GCN-IR: ; %bb.0:
+; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_sext_i32_i16 s1, s1
; GCN-IR-NEXT: s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 24
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 24
-; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], 16
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[0:1], 16
-; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31
-; GCN-IR-NEXT: s_mov_b32 s1, s0
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[6:7], 16
-; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GCN-IR-NEXT: s_sub_u32 s2, s2, s0
-; GCN-IR-NEXT: s_subb_u32 s3, s3, s0
-; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31
-; GCN-IR-NEXT: s_mov_b32 s11, s10
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11]
-; GCN-IR-NEXT: s_sub_u32 s6, s6, s10
-; GCN-IR-NEXT: s_subb_u32 s7, s7, s10
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[6:7]
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[2:3]
-; GCN-IR-NEXT: s_sub_u32 s14, s12, s20
-; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63
-; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[16:17]
-; GCN-IR-NEXT: s_and_b64 s[10:11], s[16:17], exec
-; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3
-; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2
-; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s16, s14, 1
-; GCN-IR-NEXT: s_addc_u32 s17, s15, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[16:17], 0
-; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14
-; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s16
-; GCN-IR-NEXT: s_add_u32 s18, s6, -1
-; GCN-IR-NEXT: s_addc_u32 s19, s7, -1
-; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT: s_add_u32 s12, s8, s20
-; GCN-IR-NEXT: s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT: s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT: s_mov_b32 s9, 0
-; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while
-; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[8:9]
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11]
-; GCN-IR-NEXT: s_sub_u32 s8, s18, s14
-; GCN-IR-NEXT: s_subb_u32 s8, s19, s15
-; GCN-IR-NEXT: s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT: s_mov_b32 s17, s16
-; GCN-IR-NEXT: s_and_b32 s8, s16, 1
-; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT: s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT: s_add_u32 s12, s12, 1
-; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT: s_mov_b64 s[16:17], s[8:9]
-; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT: .LBB9_4: ; %Flow4
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT: .LBB9_5: ; %udiv-end
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x9
-; GCN-IR-NEXT: s_mul_i32 s4, s6, s11
-; GCN-IR-NEXT: v_mov_b32_e32 v2, s3
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; GCN-IR-NEXT: s_mul_i32 s4, s7, s10
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; GCN-IR-NEXT: s_mul_i32 s4, s6, s10
-; GCN-IR-NEXT: v_mov_b32_e32 v1, s4
-; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s2, v1
-; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; GCN-IR-NEXT: v_xor_b32_e32 v1, s0, v1
-; GCN-IR-NEXT: v_xor_b32_e32 v0, s1, v0
-; GCN-IR-NEXT: v_mov_b32_e32 v2, s1
-; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1
-; GCN-IR-NEXT: s_mov_b32 s15, 0xf000
-; GCN-IR-NEXT: s_mov_b32 s14, -1
-; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v0, v2, vcc
-; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: buffer_store_short v0, off, s[12:15], 0 offset:4
-; GCN-IR-NEXT: buffer_store_dword v1, off, s[12:15], 0
+; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s4
+; GCN-IR-NEXT: v_alignbit_b32 v0, s5, v0, 24
+; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0
+; GCN-IR-NEXT: v_mov_b32_e32 v2, s2
+; GCN-IR-NEXT: v_alignbit_b32 v2, s3, v2, 24
+; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v2
+; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v5, v2, v0
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 30, v5
+; GCN-IR-NEXT: v_or_b32_e32 v5, 1, v5
+; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4
+; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4
+; GCN-IR-NEXT: v_mad_f32 v3, -v4, v1, v3
+; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4
+; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1|
+; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
+; GCN-IR-NEXT: s_mov_b32 s4, s0
+; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0
+; GCN-IR-NEXT: s_mov_b32 s5, s1
+; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; GCN-IR-NEXT: s_endpgm
%1 = ashr i48 %x, 24
%2 = ashr i48 %y, 24
diff --git a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
new file mode 100644
index 000000000000..f52f1164f2ba
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Reduce a 64-bit sub by a constant if we know the low 32-bits are all
+; zero.
+
+; sub i64:x, K if computeTrailingZeros(K) >= 32
+; => build_pair (sub x.hi, K.hi), x.lo
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, 0xfffc0000
+; GFX9-NEXT: ; return to shader part epilog
+ %sub = sub i64 %reg, 1125899906842624 ; (1 << 50)
+ ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_1(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, -1
+; GFX9-NEXT: ; return to shader part epilog
+ %sub = sub i64 %reg, 4294967296 ; (1 << 32)
+ ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_2(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, -2
+; GFX9-NEXT: ; return to shader part epilog
+ %sub = sub i64 %reg, 8589934592 ; (1 << 33)
+ ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_3(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, 0x80000000
+; GFX9-NEXT: ; return to shader part epilog
+ %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63)
+ ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_4(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, 1
+; GFX9-NEXT: ; return to shader part epilog
+ %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000
+ ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 0xfffc0000, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %reg, 1125899906842624 ; (1 << 50)
+ ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_1(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %reg, 4294967296 ; (1 << 32)
+ ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_2(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, -2, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %reg, 8589934592 ; (1 << 33)
+ ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_3(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63)
+ ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_4(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000
+ ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_high_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_high_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_u32 s0, s0, 1
+; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: ; return to shader part epilog
+ %sub = sub i64 %reg, 4294967295 ; (1 << 31)
+ ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_high_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_high_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %reg, 4294967295 ; (1 << 31)
+ ret i64 %sub
+}
+
+define <2 x i64> @v_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_sub_v2i64_splat_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT: v_add_u32_e32 v3, -1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+ ret <2 x i64> %sub
+}
+
+define <2 x i64> @v_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_sub_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT: v_add_u32_e32 v3, -2, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+ ret <2 x i64> %sub
+}
+
+define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_sub_v2i64_splat_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, -1
+; GFX9-NEXT: s_add_i32 s3, s3, -1
+; GFX9-NEXT: ; return to shader part epilog
+ %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+ ret <2 x i64> %sub
+}
+
+define amdgpu_ps <2 x i64> @s_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_sub_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_add_i32 s1, s1, -1
+; GFX9-NEXT: s_add_i32 s3, s3, -2
+; GFX9-NEXT: ; return to shader part epilog
+ %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+ ret <2 x i64> %sub
+}
+
+; We could reduce this to use a 32-bit sub if we use computeKnownBits
+define i64 @v_sub_i64_variable_high_bits_known0_0(i64 %reg, i32 %offset.hi32) {
+; GFX9-LABEL: v_sub_i64_variable_high_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, 0, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+ %in.high.bits = shl i64 %zext.offset.hi32, 32
+ %sub = sub i64 %reg, %in.high.bits
+ ret i64 %sub
+}
+
+; We could reduce this to use a 32-bit sub if we use computeKnownBits
+define amdgpu_ps i64 @s_sub_i64_variable_high_bits_known0_0(i64 inreg %reg, i32 inreg %offset.hi32) {
+; GFX9-LABEL: s_sub_i64_variable_high_bits_known0_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_sub_u32 s0, s0, 0
+; GFX9-NEXT: s_subb_u32 s1, s1, s2
+; GFX9-NEXT: ; return to shader part epilog
+ %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+ %in.high.bits = shl i64 %zext.offset.hi32, 32
+ %sub = sub i64 %reg, %in.high.bits
+ ret i64 %sub
+}
diff --git a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir
new file mode 100644
index 000000000000..831570800d06
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -stress-regalloc=4 -verify-regalloc -start-before=greedy,2 -stop-after=virtregrewriter,2 -o - %s | FileCheck %s
+
+# This testcase hit a situation where greedy would hit a use after
+# free during last chance recoloring. This case successfully allocates
+# after, but is extremely sensitive to the exact allocation ordering.
+
+---
+name: swdev502267_use_after_free_last_chance_recoloring_alloc_succeeds
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 16 }
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ ; CHECK-LABEL: name: swdev502267_use_after_free_last_chance_recoloring_alloc_succeeds
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX4 killed renamable $vgpr4_vgpr5, 0, 0, implicit $exec :: (volatile load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr4 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, $vgpr3, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr5 = V_FMA_F32_e64 0, $vgpr7, 0, $vgpr7, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SI_SPILL_AV128_SAVE $vgpr6_vgpr7_vgpr8_vgpr9, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr6 = V_FMA_F32_e64 0, killed $vgpr8, 0, $vgpr8, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF, $vgpr4_vgpr5_vgpr6_vgpr7:0x00000000000000FF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr0 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE $vgpr4_vgpr5_vgpr6_vgpr7, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr0 = V_TRUNC_F32_e32 killed $vgpr0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr5 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr7, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr5
+ ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr8 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, killed $vgpr6, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $vgpr8_vgpr9
+ ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr4, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: dead renamable $vgpr1 = V_FMA_F32_e64 0, killed $vgpr5, 0, $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5)
+ ; CHECK-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 killed renamable $vgpr4_vgpr5, renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+ %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %1:vreg_64_align2 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, implicit $exec :: (volatile load (s128), addrspace 1)
+ undef %4.sub0:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub0, 0, %3.sub0, 0, %0.sub3, 0, 0, implicit $mode, implicit $exec
+ %4.sub1:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub1, 0, %3.sub1, 0, %0.sub2, 0, 0, implicit $mode, implicit $exec
+ %4.sub2:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub2, 0, %3.sub2, 0, %0.sub1, 0, 0, implicit $mode, implicit $exec
+ %4.sub3:vreg_128_align2 = IMPLICIT_DEF
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+ bb.1:
+ %5:vgpr_32 = V_FMA_F32_e64 0, %4.sub2, 0, %4.sub2, 0, %3.sub2, 0, 0, implicit $mode, implicit $exec
+ %6:vgpr_32 = V_TRUNC_F32_e32 %5, implicit $mode, implicit $exec
+ undef %7.sub3:vreg_128_align2 = nofpexcept V_DIV_FIXUP_F32_e64 0, %2, 0, %4.sub3, 0, %3.sub3, 0, 0, implicit $mode, implicit $exec
+ %7.sub2:vreg_128_align2 = nofpexcept V_FMA_F32_e64 1, %6, 0, %4.sub2, 0, %3.sub2, 0, 0, implicit $mode, implicit $exec
+ %7.sub0:vreg_128_align2 = nofpexcept V_DIV_FIXUP_F32_e64 0, %2, 0, %4.sub0, 0, %3.sub0, 0, 0, implicit $mode, implicit $exec
+ %8:vgpr_32 = V_FMA_F32_e64 0, %4.sub1, 0, %4.sub1, 0, %3.sub1, 0, 0, implicit $mode, implicit $exec
+ %9:vreg_128_align2 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5)
+ GLOBAL_STORE_DWORDX4 %1, %7, 0, 0, implicit $exec :: (volatile store (s128), addrspace 1)
+
+ bb.2:
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %0
+ SI_RETURN implicit $vgpr0_vgpr1_vgpr2_vgpr3
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll
new file mode 100644
index 000000000000..f0b3d334af67
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll
@@ -0,0 +1,23 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs=0 -O0 2> %t.err < %s | FileCheck %s
+; RUN: FileCheck -check-prefix=ERR %s < %t.err
+
+; FIXME: This error will be fixed by supporting arbitrary divergent
+; dynamic allocas by performing a wave umax of the size.
+
+; ERR: error: <unknown>:0:0: in function move_to_valu_assert_srd_is_physreg_swdev503538 i32 (ptr addrspace(1)): illegal VGPR to SGPR copy
+
+; CHECK: ; illegal copy v0 to s32
+
+define i32 @move_to_valu_assert_srd_is_physreg_swdev503538(ptr addrspace(1) %ptr) {
+entry:
+ %idx = load i32, ptr addrspace(1) %ptr, align 4
+ %zero = extractelement <4 x i32> zeroinitializer, i32 %idx
+ %alloca = alloca [2048 x i8], i32 %zero, align 8, addrspace(5)
+ %ld = load i32, ptr addrspace(5) %alloca, align 8
+ call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 2048, i1 false)
+ ret i32 %ld
+}
+
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll
index 557d023c45f9..4726e81ceb8c 100644
--- a/llvm/test/CodeGen/AMDGPU/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -1,6 +1,8 @@
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
@@ -84,6 +86,8 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_i64(ptr addrspace(1) %out, ptr add
; VI: v_max_u16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}}
; VI: v_min_u16_e32 {{v[0-9]}}, 17, [[MAX]]
; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define amdgpu_kernel void @v_test_umed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -707,6 +711,8 @@ bb:
; VI: v_max_u16
; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_umed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -728,6 +734,8 @@ bb:
; GCN-LABEL: {{^}}v_test_umed3_i16_pat_1:
; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_umed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
index ad4ad6df73e7..b663acb8ce3f 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
@@ -67,7 +67,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAITCNT 0
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
- ; GCN-NEXT: CFI_INSTRUCTION offset $vgpr0_lo16, 16
+ ; GCN-NEXT: CFI_INSTRUCTION offset $vgpr0, 16
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
CFI_INSTRUCTION offset $vgpr0, 16