diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll | 769 |
1 files changed, 402 insertions, 367 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll index 0577117e9d9e..d81faf91801b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -10,10 +10,10 @@ define float @v_pow_f32(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -25,19 +25,19 @@ define float @v_pow_f32(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -49,19 +49,19 @@ define float @v_pow_f32(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -73,17 +73,18 @@ define float @v_pow_f32(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -91,32 +92,34 @@ define float @v_pow_f32(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -127,111 +130,114 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[4:5] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 5, v4 ; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v4 ; GFX6-NEXT: v_log_f32_e32 v1, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, 0x42000000 -; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; GFX6-NEXT: v_sub_f32_e32 v0, v0, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v5, 0x42000000 +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX6-NEXT: v_sub_f32_e32 v0, v0, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5] ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX6-NEXT: v_sub_f32_e32 v1, v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x42800000 +; GFX6-NEXT: v_mov_b32_e32 v6, 0x42800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 ; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc ; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v7, s[4:5] +; GFX6-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[4:5] ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5] -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_not_b32_e32 v4, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v5, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX8-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 5, v4 ; GFX8-NEXT: v_log_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v4 ; GFX8-NEXT: v_log_f32_e32 v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, 0x42000000 -; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x42000000 +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5] ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX8-NEXT: v_sub_f32_e32 v1, v1, v5 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x42800000 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x42800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc ; GFX8-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v7, s[4:5] +; GFX8-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[4:5] ; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX8-NEXT: v_exp_f32_e32 v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5] -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX8-NEXT: v_not_b32_e32 v4, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX8-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 1.0, v5, s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX9-NEXT: v_ldexp_f32 v0, v0, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 5, v4 ; GFX9-NEXT: v_log_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_ldexp_f32 v1, v1, v4 ; GFX9-NEXT: v_log_f32_e32 v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x42000000 -; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v6, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, 0x42000000 +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5] ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX9-NEXT: v_sub_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x42800000 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x42800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc ; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v7, s[4:5] +; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[4:5] ; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_exp_f32_e32 v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 1.0, v4, s[4:5] -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_not_b32_e32 v4, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] +; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f32: @@ -239,10 +245,12 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s4 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 5, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v4 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s4 ; GFX10-NEXT: v_log_f32_e32 v0, v0 @@ -257,46 +265,54 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s4 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s4 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: v_exp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX10-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 ; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s0 -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v1, v1, v5 ; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: v_log_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v2 :: v_dual_mul_dx9_zero_f32 v1, v1, v3 -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_sub_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v4, 5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_ldexp_f32 v0, v0, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, vcc_lo +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x1f800000, s0 -; GFX11-NEXT: v_exp_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v1, v1 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y) ret <2 x float> %pow @@ -316,9 +332,9 @@ define half @v_pow_f16(half %x, half %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -388,18 +404,18 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc +; GFX6-NEXT: v_not_b32_e32 v3, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -508,17 +524,17 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; GFX6-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 +; GFX6-NEXT: v_not_b32_e32 v5, 63 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_exp_f32_e32 v2, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v1, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v1, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -634,17 +650,17 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 +; GFX6-NEXT: v_not_b32_e32 v5, 63 ; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v6 +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -764,17 +780,17 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, 0x1f800000 +; GFX6-NEXT: v_not_b32_e32 v5, 63 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v6, 1.0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v5, vcc ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GFX6-NEXT: v_exp_f32_e32 v2, v2 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v1, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v2, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v5, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -885,10 +901,10 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -900,19 +916,19 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fabs_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -924,19 +940,19 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fabs_lhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -948,17 +964,18 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fabs_lhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, |v0| -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4 -; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -966,9 +983,9 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fabs_lhs: @@ -976,23 +993,24 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %pow = call float @llvm.pow.f32(float %fabs.x, float %y) @@ -1004,10 +1022,10 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1019,19 +1037,19 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fabs_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1043,19 +1061,19 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fabs_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1067,17 +1085,18 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fabs_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -1085,32 +1104,34 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fabs_rhs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %pow = call float @llvm.pow.f32(float %x, float %fabs.y) @@ -1122,10 +1143,10 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e64 v0, |v0|, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1137,19 +1158,19 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fabs_lhs_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1161,19 +1182,19 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fabs_lhs_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1185,17 +1206,18 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fabs_lhs_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, |v0| -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4 -; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -1203,9 +1225,9 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fabs_lhs_rhs: @@ -1213,23 +1235,24 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, |v0| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v0, |v0|, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fabs.y = call float @llvm.fabs.f32(float %y) @@ -1241,10 +1264,10 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX6-LABEL: v_pow_f32_sgpr_vgpr: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v1, s0, v1 ; GFX6-NEXT: v_log_f32_e32 v1, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1256,18 +1279,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_sgpr_vgpr: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX8-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX8-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX8-NEXT: v_log_f32_e32 v1, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1279,18 +1302,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_sgpr_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, s0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX9-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX9-NEXT: v_log_f32_e32 v1, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1302,49 +1325,51 @@ define amdgpu_ps float @v_pow_f32_sgpr_vgpr(float inreg %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_sgpr_vgpr: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s1 -; GFX10-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX10-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX10-NEXT: v_log_f32_e32 v1, v1 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_sgpr_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s1 -; GFX11-NEXT: v_mul_f32_e32 v1, s0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v1, s0, v1 ; GFX11-NEXT: v_log_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -1354,10 +1379,10 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX6-LABEL: v_pow_f32_vgpr_sgpr: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1369,18 +1394,18 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_vgpr_sgpr: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1392,18 +1417,18 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_vgpr_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1415,16 +1440,17 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_vgpr_sgpr: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -1432,31 +1458,33 @@ define amdgpu_ps float @v_pow_f32_vgpr_sgpr(float %x, float inreg %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_vgpr_sgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -1466,10 +1494,10 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX6-LABEL: v_pow_f32_sgpr_sgpr: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX6-NEXT: v_ldexp_f32_e32 v0, s0, v0 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1481,18 +1509,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_pow_f32_sgpr_sgpr: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX8-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1504,18 +1532,18 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_pow_f32_sgpr_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX9-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1527,49 +1555,51 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_pow_f32_sgpr_sgpr: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s2 -; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX10-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_pow_f32_sgpr_sgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s2 -; GFX11-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v0, s0, v0 ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow @@ -1580,10 +1610,10 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e64 v0, -v0, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1595,19 +1625,19 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, -v0, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1619,19 +1649,19 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fneg_lhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, -v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1643,17 +1673,18 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fneg_lhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, -v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4 -; GFX10-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, -v0, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -1661,9 +1692,9 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fneg_lhs: @@ -1671,23 +1702,24 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, -v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX11-NEXT: v_mul_f32_e64 v0, -v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f32 v0, -v0, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %pow = call float @llvm.pow.f32(float %neg.x, float %y) @@ -1699,10 +1731,10 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX6-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX6-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1714,19 +1746,19 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_not_b32_e32 v1, 63 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f32_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1738,19 +1770,19 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v1, 63 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_f32_fneg_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 0x800000 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x42000000 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc @@ -1762,17 +1794,18 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_not_b32_e32 v1, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_f32_fneg_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo ; GFX10-NEXT: v_log_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 @@ -1780,32 +1813,34 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) { ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo ; GFX10-NEXT: v_exp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_f32_fneg_rhs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_log_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_exp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg float %y %pow = call float @llvm.pow.f32(float %x, float %neg.y) |
