diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 2463 |
1 files changed, 1453 insertions, 1010 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 6fe66655de3d..6d6d18de86ab 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -86,71 +86,78 @@ define inreg double @bitcast_i64_to_f64_scalar(i64 inreg %a, i32 inreg %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB1_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB1_3 -; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB1_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB1_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: .LBB1_4: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB1_4: -; SI-NEXT: s_branch .LBB1_2 ; ; VI-LABEL: bitcast_i64_to_f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB1_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB1_3 -; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB1_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB1_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_u32 s16, s16, 3 ; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: .LBB1_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB1_4: -; VI-NEXT: s_branch .LBB1_2 ; ; GFX9-LABEL: bitcast_i64_to_f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB1_3 -; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB1_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB1_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: .LBB1_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB1_4: -; GFX9-NEXT: s_branch .LBB1_2 ; ; GFX11-LABEL: bitcast_i64_to_f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB1_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 -; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB1_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: .LBB1_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB1_4: -; GFX11-NEXT: s_branch .LBB1_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -244,14 +251,16 @@ define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB3_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB3_4 -; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB3_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB3_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB3_3: -; SI-NEXT: s_branch .LBB3_2 ; SI-NEXT: .LBB3_4: ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 @@ -261,14 +270,16 @@ define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB3_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB3_4 -; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB3_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB3_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB3_3: -; VI-NEXT: s_branch .LBB3_2 ; VI-NEXT: .LBB3_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -278,14 +289,16 @@ define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 -; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB3_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB3_3: -; GFX9-NEXT: s_branch .LBB3_2 ; GFX9-NEXT: .LBB3_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -295,16 +308,17 @@ define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB3_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 -; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB3_3: -; GFX11-NEXT: s_branch .LBB3_2 ; GFX11-NEXT: .LBB3_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -405,71 +419,78 @@ define inreg <2 x i32> @bitcast_i64_to_v2i32_scalar(i64 inreg %a, i32 inreg %b) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB5_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB5_3 -; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB5_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB5_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: .LBB5_4: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB5_4: -; SI-NEXT: s_branch .LBB5_2 ; ; VI-LABEL: bitcast_i64_to_v2i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB5_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB5_3 -; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB5_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB5_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_u32 s16, s16, 3 ; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: .LBB5_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB5_4: -; VI-NEXT: s_branch .LBB5_2 ; ; GFX9-LABEL: bitcast_i64_to_v2i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB5_3 -; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB5_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB5_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: .LBB5_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB5_4: -; GFX9-NEXT: s_branch .LBB5_2 ; ; GFX11-LABEL: bitcast_i64_to_v2i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB5_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 -; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB5_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: .LBB5_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB5_4: -; GFX11-NEXT: s_branch .LBB5_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -566,71 +587,78 @@ define inreg i64 @bitcast_v2i32_to_i64_scalar(<2 x i32> inreg %a, i32 inreg %b) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB7_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB7_3 -; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB7_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB7_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: .LBB7_4: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB7_4: -; SI-NEXT: s_branch .LBB7_2 ; ; VI-LABEL: bitcast_v2i32_to_i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB7_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB7_3 -; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB7_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s17, s17, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: .LBB7_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB7_4: -; VI-NEXT: s_branch .LBB7_2 ; ; GFX9-LABEL: bitcast_v2i32_to_i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB7_3 -; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: .LBB7_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB7_4: -; GFX9-NEXT: s_branch .LBB7_2 ; ; GFX11-LABEL: bitcast_v2i32_to_i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB7_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 -; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: .LBB7_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB7_4: -; GFX11-NEXT: s_branch .LBB7_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -728,71 +756,78 @@ define inreg <2 x float> @bitcast_i64_to_v2f32_scalar(i64 inreg %a, i32 inreg %b ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB9_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB9_3 -; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB9_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB9_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: .LBB9_4: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB9_4: -; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_i64_to_v2f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB9_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB9_3 -; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB9_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_u32 s16, s16, 3 ; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: .LBB9_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB9_4: -; VI-NEXT: s_branch .LBB9_2 ; ; GFX9-LABEL: bitcast_i64_to_v2f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB9_3 -; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB9_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: .LBB9_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB9_4: -; GFX9-NEXT: s_branch .LBB9_2 ; ; GFX11-LABEL: bitcast_i64_to_v2f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB9_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 -; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB9_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: .LBB9_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB9_4: -; GFX11-NEXT: s_branch .LBB9_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -888,15 +923,17 @@ define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB11_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB11_4 -; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB11_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB11_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_3: -; SI-NEXT: s_branch .LBB11_2 ; SI-NEXT: .LBB11_4: ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 @@ -906,15 +943,17 @@ define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB11_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB11_4 -; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB11_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB11_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB11_3: -; VI-NEXT: s_branch .LBB11_2 ; VI-NEXT: .LBB11_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -924,15 +963,17 @@ define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB11_4 -; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB11_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 ; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB11_3: -; GFX9-NEXT: s_branch .LBB11_2 ; GFX9-NEXT: .LBB11_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -942,17 +983,18 @@ define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB11_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 -; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 ; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB11_3: -; GFX11-NEXT: s_branch .LBB11_2 ; GFX11-NEXT: .LBB11_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1063,6 +1105,7 @@ define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v0, s16 @@ -1083,60 +1126,67 @@ define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) ; SI-NEXT: .LBB13_4: ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: s_branch .LBB13_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB13_2 +; SI-NEXT: s_branch .LBB13_3 ; ; VI-LABEL: bitcast_i64_to_v4i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB13_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB13_3 -; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB13_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB13_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_u32 s16, s16, 3 ; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: .LBB13_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB13_4: -; VI-NEXT: s_branch .LBB13_2 ; ; GFX9-LABEL: bitcast_i64_to_v4i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB13_3 -; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB13_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB13_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: .LBB13_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB13_4: -; GFX9-NEXT: s_branch .LBB13_2 ; ; GFX11-LABEL: bitcast_i64_to_v4i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB13_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 -; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB13_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: .LBB13_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB13_4: -; GFX11-NEXT: s_branch .LBB13_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -1264,6 +1314,7 @@ define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[6:7], -1 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -1290,16 +1341,22 @@ define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: ; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; SI-NEXT: s_branch .LBB15_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; SI-NEXT: s_cbranch_vccz .LBB15_2 +; SI-NEXT: s_branch .LBB15_3 ; ; VI-LABEL: bitcast_v4i16_to_i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB15_3 -; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB15_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s5, s17, 3 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff @@ -1310,26 +1367,26 @@ define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 -; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: .LBB15_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB15_4: -; VI-NEXT: s_branch .LBB15_2 ; ; GFX9-LABEL: bitcast_v4i16_to_i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB15_4 -; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB15_3: -; GFX9-NEXT: s_branch .LBB15_2 ; GFX9-NEXT: .LBB15_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -1339,17 +1396,18 @@ define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB15_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 -; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB15_3: -; GFX11-NEXT: s_branch .LBB15_2 ; GFX11-NEXT: .LBB15_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1478,6 +1536,7 @@ define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s17, 16 @@ -1503,60 +1562,67 @@ define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_branch .LBB17_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB17_2 +; SI-NEXT: s_branch .LBB17_3 ; ; VI-LABEL: bitcast_i64_to_v4f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB17_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB17_3 -; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB17_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB17_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_u32 s16, s16, 3 ; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: .LBB17_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB17_4: -; VI-NEXT: s_branch .LBB17_2 ; ; GFX9-LABEL: bitcast_i64_to_v4f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB17_3 -; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB17_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: .LBB17_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB17_4: -; GFX9-NEXT: s_branch .LBB17_2 ; ; GFX11-LABEL: bitcast_i64_to_v4f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB17_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 -; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB17_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: .LBB17_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB17_4: -; GFX11-NEXT: s_branch .LBB17_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -1698,6 +1764,7 @@ define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 @@ -1726,16 +1793,22 @@ define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB19_2 +; SI-NEXT: s_branch .LBB19_3 ; ; VI-LABEL: bitcast_v4f16_to_i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB19_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB19_4 -; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB19_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB19_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshr_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -1748,8 +1821,6 @@ define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) ; VI-NEXT: v_add_f16_e32 v0, s16, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB19_3: -; VI-NEXT: s_branch .LBB19_2 ; VI-NEXT: .LBB19_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -1759,16 +1830,18 @@ define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB19_4 -; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB19_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 ; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB19_3: -; GFX9-NEXT: s_branch .LBB19_2 ; GFX9-NEXT: .LBB19_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -1778,17 +1851,18 @@ define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB19_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 -; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB19_3: -; GFX11-NEXT: s_branch .LBB19_2 ; GFX11-NEXT: .LBB19_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1913,6 +1987,7 @@ define inreg <4 x bfloat> @bitcast_i64_to_v4bf16_scalar(i64 inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB21_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 @@ -1938,60 +2013,67 @@ define inreg <4 x bfloat> @bitcast_i64_to_v4bf16_scalar(i64 inreg %a, i32 inreg ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: s_branch .LBB21_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB21_2 +; SI-NEXT: s_branch .LBB21_3 ; ; VI-LABEL: bitcast_i64_to_v4bf16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB21_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB21_3 -; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB21_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_u32 s16, s16, 3 ; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: .LBB21_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB21_4: -; VI-NEXT: s_branch .LBB21_2 ; ; GFX9-LABEL: bitcast_i64_to_v4bf16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB21_3 -; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: .LBB21_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB21_4: -; GFX9-NEXT: s_branch .LBB21_2 ; ; GFX11-LABEL: bitcast_i64_to_v4bf16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB21_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB21_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB21_3 -; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: .LBB21_3: ; %end +; GFX11-NEXT: .LBB21_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB21_4: -; GFX11-NEXT: s_branch .LBB21_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2277,6 +2359,7 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 @@ -2301,16 +2384,22 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB23_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB23_2 +; SI-NEXT: s_branch .LBB23_3 ; ; VI-LABEL: bitcast_v4bf16_to_i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB23_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB23_4 -; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB23_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 @@ -2349,8 +2438,6 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB23_3: -; VI-NEXT: s_branch .LBB23_2 ; VI-NEXT: .LBB23_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -2360,10 +2447,14 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB23_4 -; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 @@ -2405,8 +2496,6 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB23_3: -; GFX9-NEXT: s_branch .LBB23_2 ; GFX9-NEXT: .LBB23_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -2416,12 +2505,15 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB23_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 -; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 @@ -2467,8 +2559,6 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB23_3: -; GFX11-NEXT: s_branch .LBB23_2 ; GFX11-NEXT: .LBB23_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2706,6 +2796,7 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v0, s16 @@ -2740,12 +2831,15 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB25_2 +; SI-NEXT: s_branch .LBB25_3 ; ; VI-LABEL: bitcast_i64_to_v8i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB25_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -2781,12 +2875,15 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { ; VI-NEXT: ; implicit-def: $sgpr9 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr5 -; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB25_2 +; VI-NEXT: s_branch .LBB25_3 ; ; GFX9-LABEL: bitcast_i64_to_v8i8_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -2822,13 +2919,15 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { ; GFX9-NEXT: ; implicit-def: $sgpr9 ; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: ; implicit-def: $sgpr5 -; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB25_2 +; GFX9-NEXT: s_branch .LBB25_3 ; ; GFX11-LABEL: bitcast_i64_to_v8i8_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_mov_b32 s8, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 @@ -2837,8 +2936,7 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 ; GFX11-NEXT: s_lshr_b32 s6, s0, 16 ; GFX11-NEXT: s_lshr_b32 s7, s0, 8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 -; GFX11-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-NEXT: s_cbranch_execnz .LBB25_3 ; GFX11-NEXT: .LBB25_2: ; %cmp.true ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 @@ -2862,7 +2960,9 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { ; GFX11-NEXT: ; implicit-def: $sgpr5 ; GFX11-NEXT: ; implicit-def: $sgpr4 ; GFX11-NEXT: ; implicit-def: $sgpr3 -; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccz .LBB25_2 +; GFX11-NEXT: s_branch .LBB25_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3249,6 +3349,7 @@ define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_mov_b64 s[6:7], -1 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff @@ -3303,12 +3404,15 @@ define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) { ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: ; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; SI-NEXT: s_branch .LBB27_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; SI-NEXT: s_cbranch_vccz .LBB27_2 +; SI-NEXT: s_branch .LBB27_3 ; ; VI-LABEL: bitcast_v8i8_to_i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -3363,12 +3467,15 @@ define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) { ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_4: ; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; VI-NEXT: s_branch .LBB27_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB27_2 +; VI-NEXT: s_branch .LBB27_3 ; ; GFX9-LABEL: bitcast_v8i8_to_i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -3423,35 +3530,36 @@ define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) { ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB27_4: ; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_branch .LBB27_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB27_2 +; GFX9-NEXT: s_branch .LBB27_3 ; ; GFX11-LABEL: bitcast_v8i8_to_i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_and_b32 s4, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s1, 8 -; GFX11-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-NEXT: s_or_b32 s7, s7, s8 -; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s8, s9 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 -; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_cbranch_execnz .LBB27_3 ; GFX11-NEXT: .LBB27_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s0, s0, 3 ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 @@ -3485,7 +3593,9 @@ define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) { ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB27_4: ; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccz .LBB27_2 +; GFX11-NEXT: s_branch .LBB27_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3579,14 +3689,16 @@ define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB29_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB29_4 -; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB29_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB29_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB29_3: -; SI-NEXT: s_branch .LBB29_2 ; SI-NEXT: .LBB29_4: ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 @@ -3596,14 +3708,16 @@ define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg % ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB29_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB29_4 -; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB29_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB29_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB29_3: -; VI-NEXT: s_branch .LBB29_2 ; VI-NEXT: .LBB29_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -3613,14 +3727,16 @@ define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB29_4 -; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB29_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB29_3: -; GFX9-NEXT: s_branch .LBB29_2 ; GFX9-NEXT: .LBB29_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -3630,16 +3746,17 @@ define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB29_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 -; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB29_3: -; GFX11-NEXT: s_branch .LBB29_2 ; GFX11-NEXT: .LBB29_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3739,71 +3856,78 @@ define inreg double @bitcast_v2i32_to_f64_scalar(<2 x i32> inreg %a, i32 inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB31_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB31_3 -; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB31_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB31_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: .LBB31_4: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB31_4: -; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v2i32_to_f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB31_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB31_3 -; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB31_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB31_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s17, s17, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: .LBB31_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB31_4: -; VI-NEXT: s_branch .LBB31_2 ; ; GFX9-LABEL: bitcast_v2i32_to_f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB31_3 -; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB31_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: .LBB31_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB31_4: -; GFX9-NEXT: s_branch .LBB31_2 ; ; GFX11-LABEL: bitcast_v2i32_to_f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB31_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB31_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB31_3 -; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: .LBB31_3: ; %end +; GFX11-NEXT: .LBB31_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB31_4: -; GFX11-NEXT: s_branch .LBB31_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3897,14 +4021,16 @@ define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB33_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB33_4 -; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB33_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB33_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB33_3: -; SI-NEXT: s_branch .LBB33_2 ; SI-NEXT: .LBB33_4: ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 @@ -3914,14 +4040,16 @@ define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB33_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB33_4 -; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB33_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB33_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB33_3: -; VI-NEXT: s_branch .LBB33_2 ; VI-NEXT: .LBB33_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -3931,14 +4059,16 @@ define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB33_4 -; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB33_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB33_3: -; GFX9-NEXT: s_branch .LBB33_2 ; GFX9-NEXT: .LBB33_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -3948,16 +4078,17 @@ define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB33_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 -; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB33_3: -; GFX11-NEXT: s_branch .LBB33_2 ; GFX11-NEXT: .LBB33_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4056,15 +4187,17 @@ define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB35_3 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB35_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB35_4 -; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB35_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB35_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB35_3: -; SI-NEXT: s_branch .LBB35_2 ; SI-NEXT: .LBB35_4: ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 @@ -4074,15 +4207,17 @@ define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB35_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB35_4 -; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB35_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB35_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB35_3: -; VI-NEXT: s_branch .LBB35_2 ; VI-NEXT: .LBB35_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -4092,15 +4227,17 @@ define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB35_4 -; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB35_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 ; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB35_3: -; GFX9-NEXT: s_branch .LBB35_2 ; GFX9-NEXT: .LBB35_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -4110,17 +4247,18 @@ define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB35_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 -; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 ; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB35_3: -; GFX11-NEXT: s_branch .LBB35_2 ; GFX11-NEXT: .LBB35_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4229,6 +4367,7 @@ define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB37_3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v0, s16 @@ -4243,7 +4382,8 @@ define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg % ; SI-NEXT: .LBB37_3: ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB37_2 ; SI-NEXT: .LBB37_4: ; SI-NEXT: v_mov_b32_e32 v5, s17 ; SI-NEXT: v_mov_b32_e32 v4, s16 @@ -4257,14 +4397,16 @@ define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg % ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB37_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB37_4 -; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB37_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB37_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB37_3: -; VI-NEXT: s_branch .LBB37_2 ; VI-NEXT: .LBB37_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -4274,14 +4416,16 @@ define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB37_4 -; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB37_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB37_3: -; GFX9-NEXT: s_branch .LBB37_2 ; GFX9-NEXT: .LBB37_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -4291,16 +4435,17 @@ define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB37_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 -; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB37_3: -; GFX11-NEXT: s_branch .LBB37_2 ; GFX11-NEXT: .LBB37_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4431,6 +4576,7 @@ define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[6:7], -1 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -4457,16 +4603,22 @@ define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg % ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB39_4: ; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; SI-NEXT: s_branch .LBB39_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; SI-NEXT: s_cbranch_vccz .LBB39_2 +; SI-NEXT: s_branch .LBB39_3 ; ; VI-LABEL: bitcast_v4i16_to_f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB39_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB39_3 -; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB39_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB39_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s5, s17, 3 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff @@ -4477,26 +4629,26 @@ define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg % ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 -; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: .LBB39_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB39_4: -; VI-NEXT: s_branch .LBB39_2 ; ; GFX9-LABEL: bitcast_v4i16_to_f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB39_4 -; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB39_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB39_3: -; GFX9-NEXT: s_branch .LBB39_2 ; GFX9-NEXT: .LBB39_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -4506,17 +4658,18 @@ define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB39_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 -; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB39_3: -; GFX11-NEXT: s_branch .LBB39_2 ; GFX11-NEXT: .LBB39_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4636,6 +4789,7 @@ define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s17, 16 @@ -4660,20 +4814,24 @@ define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB41_2 +; SI-NEXT: s_branch .LBB41_3 ; ; VI-LABEL: bitcast_f64_to_v4f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB41_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB41_4 -; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB41_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB41_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB41_3: -; VI-NEXT: s_branch .LBB41_2 ; VI-NEXT: .LBB41_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -4683,14 +4841,16 @@ define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB41_4 -; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB41_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB41_3: -; GFX9-NEXT: s_branch .LBB41_2 ; GFX9-NEXT: .LBB41_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -4700,16 +4860,17 @@ define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB41_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 -; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB41_3: -; GFX11-NEXT: s_branch .LBB41_2 ; GFX11-NEXT: .LBB41_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4854,6 +5015,7 @@ define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 @@ -4882,16 +5044,22 @@ define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB43_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB43_2 +; SI-NEXT: s_branch .LBB43_3 ; ; VI-LABEL: bitcast_v4f16_to_f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB43_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB43_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB43_4 -; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB43_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB43_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshr_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -4904,8 +5072,6 @@ define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg ; VI-NEXT: v_add_f16_e32 v0, s16, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB43_3: -; VI-NEXT: s_branch .LBB43_2 ; VI-NEXT: .LBB43_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -4915,16 +5081,18 @@ define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB43_4 -; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB43_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 ; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB43_3: -; GFX9-NEXT: s_branch .LBB43_2 ; GFX9-NEXT: .LBB43_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -4934,17 +5102,18 @@ define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB43_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 -; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB43_3: -; GFX11-NEXT: s_branch .LBB43_2 ; GFX11-NEXT: .LBB43_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5058,6 +5227,7 @@ define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB45_3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s9, s17, 0xffff0000 @@ -5077,7 +5247,8 @@ define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inr ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: s_branch .LBB45_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB45_2 ; SI-NEXT: .LBB45_4: ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: v_mov_b32_e32 v2, s8 @@ -5089,14 +5260,16 @@ define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inr ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB45_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB45_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB45_4 -; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB45_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB45_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB45_3: -; VI-NEXT: s_branch .LBB45_2 ; VI-NEXT: .LBB45_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -5106,14 +5279,16 @@ define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inr ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB45_4 -; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB45_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB45_3: -; GFX9-NEXT: s_branch .LBB45_2 ; GFX9-NEXT: .LBB45_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -5123,16 +5298,17 @@ define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB45_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 -; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB45_3: -; GFX11-NEXT: s_branch .LBB45_2 ; GFX11-NEXT: .LBB45_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5421,6 +5597,7 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 @@ -5445,16 +5622,22 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB47_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB47_2 +; SI-NEXT: s_branch .LBB47_3 ; ; VI-LABEL: bitcast_v4bf16_to_f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB47_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB47_4 -; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB47_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB47_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 @@ -5493,8 +5676,6 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB47_3: -; VI-NEXT: s_branch .LBB47_2 ; VI-NEXT: .LBB47_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -5504,10 +5685,14 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB47_4 -; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB47_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 @@ -5549,8 +5734,6 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB47_3: -; GFX9-NEXT: s_branch .LBB47_2 ; GFX9-NEXT: .LBB47_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -5560,12 +5743,15 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB47_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 -; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 @@ -5611,8 +5797,6 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB47_3: -; GFX11-NEXT: s_branch .LBB47_2 ; GFX11-NEXT: .LBB47_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5843,6 +6027,7 @@ define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB49_3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v0, s16 @@ -5869,7 +6054,8 @@ define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB49_2 ; SI-NEXT: .LBB49_4: ; SI-NEXT: v_mov_b32_e32 v9, s17 ; SI-NEXT: v_mov_b32_e32 v8, s16 @@ -5885,6 +6071,7 @@ define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB49_3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -5910,7 +6097,8 @@ define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) ; VI-NEXT: ; implicit-def: $sgpr5 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr9 -; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB49_2 ; VI-NEXT: .LBB49_4: ; VI-NEXT: v_mov_b32_e32 v8, s16 ; VI-NEXT: v_mov_b32_e32 v9, s17 @@ -5929,6 +6117,7 @@ define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -5954,7 +6143,8 @@ define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) ; GFX9-NEXT: ; implicit-def: $sgpr5 ; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: ; implicit-def: $sgpr9 -; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB49_2 ; GFX9-NEXT: .LBB49_4: ; GFX9-NEXT: v_mov_b32_e32 v8, s16 ; GFX9-NEXT: v_mov_b32_e32 v9, s17 @@ -5973,17 +6163,16 @@ define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s8, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB49_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 -; GFX11-NEXT: s_lshr_b32 s6, s1, 24 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-NEXT: s_lshr_b32 s4, s1, 16 ; GFX11-NEXT: s_lshr_b32 s3, s1, 8 -; GFX11-NEXT: s_lshr_b32 s8, s0, 16 -; GFX11-NEXT: s_lshr_b32 s7, s0, 8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-NEXT: s_cbranch_execnz .LBB49_4 ; GFX11-NEXT: .LBB49_2: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[8:9], s[0:1], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5995,18 +6184,19 @@ define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-NEXT: s_branch .LBB49_5 ; GFX11-NEXT: .LBB49_3: +; GFX11-NEXT: ; implicit-def: $sgpr6 ; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr8 ; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr3 +; GFX11-NEXT: ; implicit-def: $sgpr4 ; GFX11-NEXT: ; implicit-def: $sgpr5 -; GFX11-NEXT: ; implicit-def: $sgpr6 -; GFX11-NEXT: s_branch .LBB49_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccz .LBB49_2 ; GFX11-NEXT: .LBB49_4: ; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 -; GFX11-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v6, s5 -; GFX11-NEXT: v_mov_b32_e32 v7, s6 +; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v6, s4 +; GFX11-NEXT: v_mov_b32_e32 v7, s5 ; GFX11-NEXT: v_mov_b32_e32 v5, s3 ; GFX11-NEXT: .LBB49_5: ; %end ; GFX11-NEXT: v_mov_b32_e32 v0, v8 @@ -6398,6 +6588,7 @@ define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_mov_b64 s[6:7], -1 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff @@ -6452,12 +6643,15 @@ define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: ; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; SI-NEXT: s_branch .LBB51_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; SI-NEXT: s_cbranch_vccz .LBB51_2 +; SI-NEXT: s_branch .LBB51_3 ; ; VI-LABEL: bitcast_v8i8_to_f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -6512,12 +6706,15 @@ define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: ; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; VI-NEXT: s_branch .LBB51_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB51_2 +; VI-NEXT: s_branch .LBB51_3 ; ; GFX9-LABEL: bitcast_v8i8_to_f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -6572,35 +6769,36 @@ define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB51_4: ; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB51_2 +; GFX9-NEXT: s_branch .LBB51_3 ; ; GFX11-LABEL: bitcast_v8i8_to_f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_and_b32 s4, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s1, 8 -; GFX11-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-NEXT: s_or_b32 s7, s7, s8 -; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s8, s9 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 -; GFX11-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_cbranch_execnz .LBB51_3 ; GFX11-NEXT: .LBB51_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s0, s0, 3 ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 @@ -6634,7 +6832,9 @@ define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB51_4: ; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX11-NEXT: s_branch .LBB51_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccz .LBB51_2 +; GFX11-NEXT: s_branch .LBB51_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6731,71 +6931,78 @@ define inreg <2 x float> @bitcast_v2i32_to_v2f32_scalar(<2 x i32> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB53_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB53_3 -; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB53_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB53_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: .LBB53_4: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB53_4: -; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v2i32_to_v2f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB53_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB53_3 -; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB53_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB53_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s17, s17, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: .LBB53_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB53_4: -; VI-NEXT: s_branch .LBB53_2 ; ; GFX9-LABEL: bitcast_v2i32_to_v2f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB53_3 -; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB53_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: .LBB53_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB53_4: -; GFX9-NEXT: s_branch .LBB53_2 ; ; GFX11-LABEL: bitcast_v2i32_to_v2f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB53_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 -; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: .LBB53_3: ; %end +; GFX11-NEXT: .LBB53_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB53_4: -; GFX11-NEXT: s_branch .LBB53_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6891,15 +7098,17 @@ define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: s_cbranch_scc0 .LBB55_3 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB55_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB55_4 -; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB55_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB55_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB55_3: -; SI-NEXT: s_branch .LBB55_2 ; SI-NEXT: .LBB55_4: ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 @@ -6909,15 +7118,17 @@ define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB55_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB55_4 -; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB55_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB55_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB55_3: -; VI-NEXT: s_branch .LBB55_2 ; VI-NEXT: .LBB55_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -6927,15 +7138,17 @@ define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB55_4 -; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB55_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 ; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB55_3: -; GFX9-NEXT: s_branch .LBB55_2 ; GFX9-NEXT: .LBB55_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -6945,17 +7158,18 @@ define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB55_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 -; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 ; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB55_3: -; GFX11-NEXT: s_branch .LBB55_2 ; GFX11-NEXT: .LBB55_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7065,6 +7279,7 @@ define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 in ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v0, s16 @@ -7085,60 +7300,67 @@ define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 in ; SI-NEXT: .LBB57_4: ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: s_branch .LBB57_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB57_2 +; SI-NEXT: s_branch .LBB57_3 ; ; VI-LABEL: bitcast_v2i32_to_v4i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB57_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB57_3 -; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB57_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB57_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s17, s17, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: .LBB57_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB57_4: -; VI-NEXT: s_branch .LBB57_2 ; ; GFX9-LABEL: bitcast_v2i32_to_v4i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB57_3 -; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB57_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: .LBB57_3: ; %end +; GFX9-NEXT: .LBB57_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB57_4: -; GFX9-NEXT: s_branch .LBB57_2 ; ; GFX11-LABEL: bitcast_v2i32_to_v4i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB57_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB57_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB57_3 -; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: .LBB57_3: ; %end +; GFX11-NEXT: .LBB57_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB57_4: -; GFX11-NEXT: s_branch .LBB57_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7266,6 +7488,7 @@ define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 in ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[6:7], -1 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -7292,16 +7515,22 @@ define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 in ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; SI-NEXT: s_branch .LBB59_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; SI-NEXT: s_cbranch_vccz .LBB59_2 +; SI-NEXT: s_branch .LBB59_3 ; ; VI-LABEL: bitcast_v4i16_to_v2i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB59_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB59_3 -; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB59_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB59_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s5, s17, 3 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff @@ -7312,26 +7541,26 @@ define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 in ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 -; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: .LBB59_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB59_4: -; VI-NEXT: s_branch .LBB59_2 ; ; GFX9-LABEL: bitcast_v4i16_to_v2i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB59_4 -; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB59_3: -; GFX9-NEXT: s_branch .LBB59_2 ; GFX9-NEXT: .LBB59_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -7341,17 +7570,18 @@ define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 in ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB59_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 -; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB59_3: -; GFX11-NEXT: s_branch .LBB59_2 ; GFX11-NEXT: .LBB59_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7479,6 +7709,7 @@ define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s17, 16 @@ -7504,60 +7735,67 @@ define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 i ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_branch .LBB61_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB61_2 +; SI-NEXT: s_branch .LBB61_3 ; ; VI-LABEL: bitcast_v2i32_to_v4f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB61_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB61_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB61_3 -; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB61_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB61_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s17, s17, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: .LBB61_3: ; %end +; VI-NEXT: .LBB61_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB61_4: -; VI-NEXT: s_branch .LBB61_2 ; ; GFX9-LABEL: bitcast_v2i32_to_v4f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB61_3 -; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB61_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB61_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: .LBB61_3: ; %end +; GFX9-NEXT: .LBB61_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB61_4: -; GFX9-NEXT: s_branch .LBB61_2 ; ; GFX11-LABEL: bitcast_v2i32_to_v4f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB61_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB61_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB61_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB61_3 -; GFX11-NEXT: .LBB61_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB61_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: .LBB61_3: ; %end +; GFX11-NEXT: .LBB61_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB61_4: -; GFX11-NEXT: s_branch .LBB61_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7699,6 +7937,7 @@ define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 i ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 @@ -7727,16 +7966,22 @@ define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 i ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB63_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB63_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB63_2 +; SI-NEXT: s_branch .LBB63_3 ; ; VI-LABEL: bitcast_v4f16_to_v2i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB63_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB63_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB63_4 -; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB63_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB63_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshr_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -7749,8 +7994,6 @@ define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 i ; VI-NEXT: v_add_f16_e32 v0, s16, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB63_3: -; VI-NEXT: s_branch .LBB63_2 ; VI-NEXT: .LBB63_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -7760,16 +8003,18 @@ define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 i ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB63_4 -; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB63_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 ; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB63_3: -; GFX9-NEXT: s_branch .LBB63_2 ; GFX9-NEXT: .LBB63_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -7779,17 +8024,18 @@ define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB63_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB63_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 -; GFX11-NEXT: .LBB63_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB63_3: -; GFX11-NEXT: s_branch .LBB63_2 ; GFX11-NEXT: .LBB63_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7913,6 +8159,7 @@ define inreg <4 x bfloat> @bitcast_v2i32_to_v4bf16_scalar(<2 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB65_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 @@ -7938,60 +8185,67 @@ define inreg <4 x bfloat> @bitcast_v2i32_to_v4bf16_scalar(<2 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: s_branch .LBB65_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB65_2 +; SI-NEXT: s_branch .LBB65_3 ; ; VI-LABEL: bitcast_v2i32_to_v4bf16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB65_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB65_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB65_3 -; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB65_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB65_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s17, s17, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: .LBB65_3: ; %end +; VI-NEXT: .LBB65_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB65_4: -; VI-NEXT: s_branch .LBB65_2 ; ; GFX9-LABEL: bitcast_v2i32_to_v4bf16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB65_3 -; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB65_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB65_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: .LBB65_3: ; %end +; GFX9-NEXT: .LBB65_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB65_4: -; GFX9-NEXT: s_branch .LBB65_2 ; ; GFX11-LABEL: bitcast_v2i32_to_v4bf16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB65_4 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB65_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccnz .LBB65_3 -; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB65_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: .LBB65_3: ; %end +; GFX11-NEXT: .LBB65_4: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB65_4: -; GFX11-NEXT: s_branch .LBB65_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8277,6 +8531,7 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 @@ -8301,16 +8556,22 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB67_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB67_2 +; SI-NEXT: s_branch .LBB67_3 ; ; VI-LABEL: bitcast_v4bf16_to_v2i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB67_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB67_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB67_4 -; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB67_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB67_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 @@ -8349,8 +8610,6 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB67_3: -; VI-NEXT: s_branch .LBB67_2 ; VI-NEXT: .LBB67_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -8360,10 +8619,14 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB67_4 -; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB67_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 @@ -8405,8 +8668,6 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB67_3: -; GFX9-NEXT: s_branch .LBB67_2 ; GFX9-NEXT: .LBB67_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -8416,12 +8677,15 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB67_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB67_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB67_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB67_4 -; GFX11-NEXT: .LBB67_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 @@ -8467,8 +8731,6 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB67_3: -; GFX11-NEXT: s_branch .LBB67_2 ; GFX11-NEXT: .LBB67_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8704,6 +8966,7 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v0, s16 @@ -8738,12 +9001,15 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: s_branch .LBB69_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB69_2 +; SI-NEXT: s_branch .LBB69_3 ; ; VI-LABEL: bitcast_v2i32_to_v8i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB69_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -8779,12 +9045,15 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre ; VI-NEXT: ; implicit-def: $sgpr9 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr5 -; VI-NEXT: s_branch .LBB69_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB69_2 +; VI-NEXT: s_branch .LBB69_3 ; ; GFX9-LABEL: bitcast_v2i32_to_v8i8_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -8820,13 +9089,15 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre ; GFX9-NEXT: ; implicit-def: $sgpr9 ; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: ; implicit-def: $sgpr5 -; GFX9-NEXT: s_branch .LBB69_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB69_2 +; GFX9-NEXT: s_branch .LBB69_3 ; ; GFX11-LABEL: bitcast_v2i32_to_v8i8_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_mov_b32 s8, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB69_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 @@ -8835,8 +9106,7 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre ; GFX11-NEXT: s_lshr_b32 s5, s1, 8 ; GFX11-NEXT: s_lshr_b32 s6, s0, 16 ; GFX11-NEXT: s_lshr_b32 s7, s0, 8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 -; GFX11-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-NEXT: s_cbranch_execnz .LBB69_3 ; GFX11-NEXT: .LBB69_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 @@ -8860,7 +9130,9 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre ; GFX11-NEXT: ; implicit-def: $sgpr5 ; GFX11-NEXT: ; implicit-def: $sgpr4 ; GFX11-NEXT: ; implicit-def: $sgpr3 -; GFX11-NEXT: s_branch .LBB69_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccz .LBB69_2 +; GFX11-NEXT: s_branch .LBB69_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9247,6 +9519,7 @@ define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_mov_b64 s[6:7], -1 ; SI-NEXT: s_cbranch_scc0 .LBB71_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff @@ -9301,12 +9574,15 @@ define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inre ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB71_4: ; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; SI-NEXT: s_branch .LBB71_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; SI-NEXT: s_cbranch_vccz .LBB71_2 +; SI-NEXT: s_branch .LBB71_3 ; ; VI-LABEL: bitcast_v8i8_to_v2i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB71_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -9361,12 +9637,15 @@ define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inre ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB71_4: ; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; VI-NEXT: s_branch .LBB71_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB71_2 +; VI-NEXT: s_branch .LBB71_3 ; ; GFX9-LABEL: bitcast_v8i8_to_v2i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -9421,35 +9700,36 @@ define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inre ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB71_4: ; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_branch .LBB71_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB71_2 +; GFX9-NEXT: s_branch .LBB71_3 ; ; GFX11-LABEL: bitcast_v8i8_to_v2i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_and_b32 s4, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s1, 8 -; GFX11-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-NEXT: s_or_b32 s7, s7, s8 -; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s8, s9 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 -; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_cbranch_execnz .LBB71_3 ; GFX11-NEXT: .LBB71_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s0, s0, 3 ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 @@ -9483,7 +9763,9 @@ define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inre ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB71_4: ; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX11-NEXT: s_branch .LBB71_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccz .LBB71_2 +; GFX11-NEXT: s_branch .LBB71_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9589,6 +9871,7 @@ define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB73_3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v0, s16 @@ -9604,7 +9887,8 @@ define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 ; SI-NEXT: .LBB73_3: ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: s_branch .LBB73_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB73_2 ; SI-NEXT: .LBB73_4: ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v2, s17 @@ -9615,15 +9899,17 @@ define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB73_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB73_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB73_4 -; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB73_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB73_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB73_3: -; VI-NEXT: s_branch .LBB73_2 ; VI-NEXT: .LBB73_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -9633,15 +9919,17 @@ define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB73_4 -; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB73_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 ; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB73_3: -; GFX9-NEXT: s_branch .LBB73_2 ; GFX9-NEXT: .LBB73_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -9651,17 +9939,18 @@ define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB73_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB73_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 -; GFX11-NEXT: .LBB73_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 ; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB73_3: -; GFX11-NEXT: s_branch .LBB73_2 ; GFX11-NEXT: .LBB73_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9792,6 +10081,7 @@ define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[6:7], -1 ; SI-NEXT: s_cbranch_scc0 .LBB75_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -9818,16 +10108,22 @@ define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB75_4: ; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; SI-NEXT: s_branch .LBB75_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; SI-NEXT: s_cbranch_vccz .LBB75_2 +; SI-NEXT: s_branch .LBB75_3 ; ; VI-LABEL: bitcast_v4i16_to_v2f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB75_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB75_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB75_3 -; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB75_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB75_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s5, s17, 3 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: s_and_b32 s5, s5, 0xffff @@ -9838,26 +10134,26 @@ define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 -; VI-NEXT: .LBB75_3: ; %end +; VI-NEXT: .LBB75_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB75_4: -; VI-NEXT: s_branch .LBB75_2 ; ; GFX9-LABEL: bitcast_v4i16_to_v2f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB75_4 -; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB75_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB75_3: -; GFX9-NEXT: s_branch .LBB75_2 ; GFX9-NEXT: .LBB75_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -9867,17 +10163,18 @@ define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB75_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB75_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB75_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB75_4 -; GFX11-NEXT: .LBB75_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB75_3: -; GFX11-NEXT: s_branch .LBB75_2 ; GFX11-NEXT: .LBB75_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10004,6 +10301,7 @@ define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s17, 16 @@ -10029,21 +10327,25 @@ define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_branch .LBB77_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB77_2 +; SI-NEXT: s_branch .LBB77_3 ; ; VI-LABEL: bitcast_v2f32_to_v4f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB77_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB77_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB77_4 -; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB77_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB77_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB77_3: -; VI-NEXT: s_branch .LBB77_2 ; VI-NEXT: .LBB77_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -10053,15 +10355,17 @@ define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB77_4 -; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB77_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 ; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB77_3: -; GFX9-NEXT: s_branch .LBB77_2 ; GFX9-NEXT: .LBB77_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -10071,17 +10375,18 @@ define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB77_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB77_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB77_4 -; GFX11-NEXT: .LBB77_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 ; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB77_3: -; GFX11-NEXT: s_branch .LBB77_2 ; GFX11-NEXT: .LBB77_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10226,6 +10531,7 @@ define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB79_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 @@ -10254,16 +10560,22 @@ define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB79_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB79_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB79_2 +; SI-NEXT: s_branch .LBB79_3 ; ; VI-LABEL: bitcast_v4f16_to_v2f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB79_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB79_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB79_4 -; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB79_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB79_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshr_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -10276,8 +10588,6 @@ define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 ; VI-NEXT: v_add_f16_e32 v0, s16, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB79_3: -; VI-NEXT: s_branch .LBB79_2 ; VI-NEXT: .LBB79_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -10287,16 +10597,18 @@ define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB79_4 -; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB79_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB79_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 ; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB79_3: -; GFX9-NEXT: s_branch .LBB79_2 ; GFX9-NEXT: .LBB79_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -10306,17 +10618,18 @@ define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB79_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB79_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB79_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB79_4 -; GFX11-NEXT: .LBB79_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB79_3: -; GFX11-NEXT: s_branch .LBB79_2 ; GFX11-NEXT: .LBB79_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10439,6 +10752,7 @@ define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB81_3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 @@ -10459,7 +10773,8 @@ define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: s_branch .LBB81_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB81_2 ; SI-NEXT: .LBB81_4: ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: v_mov_b32_e32 v1, s8 @@ -10471,15 +10786,17 @@ define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB81_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB81_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB81_4 -; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB81_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB81_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 ; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB81_3: -; VI-NEXT: s_branch .LBB81_2 ; VI-NEXT: .LBB81_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -10489,15 +10806,17 @@ define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB81_4 -; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB81_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB81_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 ; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB81_3: -; GFX9-NEXT: s_branch .LBB81_2 ; GFX9-NEXT: .LBB81_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -10507,17 +10826,18 @@ define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB81_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB81_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB81_4 -; GFX11-NEXT: .LBB81_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 ; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB81_3: -; GFX11-NEXT: s_branch .LBB81_2 ; GFX11-NEXT: .LBB81_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10806,6 +11126,7 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 @@ -10830,16 +11151,22 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB83_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB83_2 +; SI-NEXT: s_branch .LBB83_3 ; ; VI-LABEL: bitcast_v4bf16_to_v2f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB83_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB83_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB83_4 -; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB83_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB83_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 @@ -10878,8 +11205,6 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB83_3: -; VI-NEXT: s_branch .LBB83_2 ; VI-NEXT: .LBB83_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -10889,10 +11214,14 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB83_4 -; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB83_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB83_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 @@ -10934,8 +11263,6 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB83_3: -; GFX9-NEXT: s_branch .LBB83_2 ; GFX9-NEXT: .LBB83_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -10945,12 +11272,15 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB83_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB83_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB83_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB83_4 -; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 @@ -10996,8 +11326,6 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB83_3: -; GFX11-NEXT: s_branch .LBB83_2 ; GFX11-NEXT: .LBB83_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11231,6 +11559,7 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB85_3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v0, s16 @@ -11258,7 +11587,8 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: s_branch .LBB85_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB85_2 ; SI-NEXT: .LBB85_4: ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v4, s17 @@ -11271,6 +11601,7 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB85_3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -11297,7 +11628,8 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; VI-NEXT: ; implicit-def: $sgpr9 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr5 -; VI-NEXT: s_branch .LBB85_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB85_2 ; VI-NEXT: .LBB85_4: ; VI-NEXT: v_mov_b32_e32 v8, s16 ; VI-NEXT: v_mov_b32_e32 v9, s17 @@ -11316,6 +11648,7 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -11342,7 +11675,8 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; GFX9-NEXT: ; implicit-def: $sgpr9 ; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: ; implicit-def: $sgpr5 -; GFX9-NEXT: s_branch .LBB85_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB85_2 ; GFX9-NEXT: .LBB85_4: ; GFX9-NEXT: v_mov_b32_e32 v8, s16 ; GFX9-NEXT: v_mov_b32_e32 v9, s17 @@ -11361,17 +11695,16 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s8, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB85_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 ; GFX11-NEXT: s_lshr_b32 s3, s1, 24 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 -; GFX11-NEXT: s_lshr_b32 s8, s0, 8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-NEXT: s_cbranch_execnz .LBB85_4 ; GFX11-NEXT: .LBB85_2: ; %cmp.true ; GFX11-NEXT: v_add_f32_e64 v9, s1, 1.0 ; GFX11-NEXT: v_add_f32_e64 v8, s0, 1.0 @@ -11384,17 +11717,18 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-NEXT: s_branch .LBB85_5 ; GFX11-NEXT: .LBB85_3: -; GFX11-NEXT: ; implicit-def: $sgpr8 ; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr6 +; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: ; implicit-def: $sgpr4 ; GFX11-NEXT: ; implicit-def: $sgpr3 -; GFX11-NEXT: s_branch .LBB85_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccz .LBB85_2 ; GFX11-NEXT: .LBB85_4: ; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 -; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 -; GFX11-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s4 ; GFX11-NEXT: v_mov_b32_e32 v7, s3 ; GFX11-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-NEXT: .LBB85_5: ; %end @@ -11787,6 +12121,7 @@ define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 in ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_mov_b64 s[6:7], -1 ; SI-NEXT: s_cbranch_scc0 .LBB87_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff @@ -11841,12 +12176,15 @@ define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 in ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB87_4: ; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; SI-NEXT: s_branch .LBB87_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; SI-NEXT: s_cbranch_vccz .LBB87_2 +; SI-NEXT: s_branch .LBB87_3 ; ; VI-LABEL: bitcast_v8i8_to_v2f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB87_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -11901,12 +12239,15 @@ define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 in ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB87_4: ; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; VI-NEXT: s_branch .LBB87_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB87_2 +; VI-NEXT: s_branch .LBB87_3 ; ; GFX9-LABEL: bitcast_v8i8_to_v2f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -11961,35 +12302,36 @@ define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 in ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB87_4: ; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_branch .LBB87_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB87_2 +; GFX9-NEXT: s_branch .LBB87_3 ; ; GFX11-LABEL: bitcast_v8i8_to_v2f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_and_b32 s4, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s1, 8 -; GFX11-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-NEXT: s_or_b32 s7, s7, s8 -; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s8, s9 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 -; GFX11-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_cbranch_execnz .LBB87_3 ; GFX11-NEXT: .LBB87_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s0, s0, 3 ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 @@ -12023,7 +12365,9 @@ define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 in ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB87_4: ; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX11-NEXT: s_branch .LBB87_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccz .LBB87_2 +; GFX11-NEXT: s_branch .LBB87_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12155,6 +12499,7 @@ define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 @@ -12178,16 +12523,22 @@ define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 i ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_branch .LBB89_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB89_2 +; SI-NEXT: s_branch .LBB89_3 ; ; VI-LABEL: bitcast_v4i16_to_v4f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB89_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB89_3 -; VI-NEXT: .LBB89_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB89_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB89_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s5, s16, 3 ; VI-NEXT: s_add_i32 s7, s17, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 @@ -12198,26 +12549,26 @@ define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 i ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 -; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: .LBB89_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB89_4: -; VI-NEXT: s_branch .LBB89_2 ; ; GFX9-LABEL: bitcast_v4i16_to_v4f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB89_4 -; GFX9-NEXT: .LBB89_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB89_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB89_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB89_3: -; GFX9-NEXT: s_branch .LBB89_2 ; GFX9-NEXT: .LBB89_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -12227,17 +12578,18 @@ define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB89_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB89_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB89_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB89_4 -; GFX11-NEXT: .LBB89_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB89_3: -; GFX11-NEXT: s_branch .LBB89_2 ; GFX11-NEXT: .LBB89_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12367,10 +12719,14 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB91_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB91_3 -; SI-NEXT: .LBB91_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB91_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB91_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -12388,19 +12744,21 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: .LBB91_4: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB91_4: -; SI-NEXT: s_branch .LBB91_2 ; ; VI-LABEL: bitcast_v4f16_to_v4i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB91_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB91_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB91_4 -; VI-NEXT: .LBB91_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB91_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB91_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshr_b32 s4, s16, 16 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 @@ -12413,8 +12771,6 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_or_b32_e32 v0, v2, v3 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB91_3: -; VI-NEXT: s_branch .LBB91_2 ; VI-NEXT: .LBB91_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -12424,16 +12780,18 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB91_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB91_4 -; GFX9-NEXT: .LBB91_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB91_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 ; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB91_3: -; GFX9-NEXT: s_branch .LBB91_2 ; GFX9-NEXT: .LBB91_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -12443,17 +12801,18 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB91_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB91_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB91_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB91_4 -; GFX11-NEXT: .LBB91_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB91_3: -; GFX11-NEXT: s_branch .LBB91_2 ; GFX11-NEXT: .LBB91_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12586,6 +12945,7 @@ define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshl_b32 s6, s16, 16 @@ -12619,16 +12979,22 @@ define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: s_branch .LBB93_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB93_2 +; SI-NEXT: s_branch .LBB93_3 ; ; VI-LABEL: bitcast_v4i16_to_v4bf16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB93_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB93_3 -; VI-NEXT: .LBB93_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB93_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB93_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s5, s16, 3 ; VI-NEXT: s_add_i32 s7, s17, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 @@ -12639,26 +13005,26 @@ define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s17, s6, 0x30000 ; VI-NEXT: s_add_i32 s16, s4, 0x30000 -; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: .LBB93_4: ; %end ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB93_4: -; VI-NEXT: s_branch .LBB93_2 ; ; GFX9-LABEL: bitcast_v4i16_to_v4bf16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB93_4 -; GFX9-NEXT: .LBB93_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB93_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB93_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB93_3: -; GFX9-NEXT: s_branch .LBB93_2 ; GFX9-NEXT: .LBB93_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -12668,17 +13034,18 @@ define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB93_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB93_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB93_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB93_4 -; GFX11-NEXT: .LBB93_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB93_3: -; GFX11-NEXT: s_branch .LBB93_2 ; GFX11-NEXT: .LBB93_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12975,6 +13342,7 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s19 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 @@ -13004,16 +13372,22 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_branch .LBB95_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB95_2 +; SI-NEXT: s_branch .LBB95_3 ; ; VI-LABEL: bitcast_v4bf16_to_v4i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB95_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB95_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB95_4 -; VI-NEXT: .LBB95_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB95_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB95_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 @@ -13052,8 +13426,6 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 ; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB95_3: -; VI-NEXT: s_branch .LBB95_2 ; VI-NEXT: .LBB95_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -13063,10 +13435,14 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB95_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB95_4 -; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB95_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 @@ -13106,8 +13482,6 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; GFX9-NEXT: v_and_or_b32 v0, v3, v4, v0 ; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB95_3: -; GFX9-NEXT: s_branch .LBB95_2 ; GFX9-NEXT: .LBB95_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -13117,12 +13491,15 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB95_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB95_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB95_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB95_4 -; GFX11-NEXT: .LBB95_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 @@ -13162,8 +13539,6 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB95_3: -; GFX11-NEXT: s_branch .LBB95_2 ; GFX11-NEXT: .LBB95_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13427,6 +13802,7 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -13477,12 +13853,15 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: s_branch .LBB97_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB97_2 +; SI-NEXT: s_branch .LBB97_3 ; ; VI-LABEL: bitcast_v4i16_to_v8i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB97_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -13528,12 +13907,15 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; VI-NEXT: ; implicit-def: $sgpr5 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: s_branch .LBB97_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB97_2 +; VI-NEXT: s_branch .LBB97_3 ; ; GFX9-LABEL: bitcast_v4i16_to_v8i8_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB97_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -13560,7 +13942,8 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; GFX9-NEXT: ; implicit-def: $sgpr9 ; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: ; implicit-def: $sgpr5 -; GFX9-NEXT: s_branch .LBB97_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB97_2 ; GFX9-NEXT: .LBB97_4: ; GFX9-NEXT: v_mov_b32_e32 v8, s16 ; GFX9-NEXT: v_mov_b32_e32 v9, s17 @@ -13579,17 +13962,16 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s8, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB97_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 ; GFX11-NEXT: s_lshr_b32 s3, s1, 24 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 -; GFX11-NEXT: s_lshr_b32 s8, s0, 8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-NEXT: s_cbranch_execnz .LBB97_4 ; GFX11-NEXT: .LBB97_2: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] @@ -13602,17 +13984,18 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-NEXT: s_branch .LBB97_5 ; GFX11-NEXT: .LBB97_3: -; GFX11-NEXT: ; implicit-def: $sgpr8 ; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr6 +; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: ; implicit-def: $sgpr4 ; GFX11-NEXT: ; implicit-def: $sgpr3 -; GFX11-NEXT: s_branch .LBB97_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccz .LBB97_2 ; GFX11-NEXT: .LBB97_4: ; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 -; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 -; GFX11-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s4 ; GFX11-NEXT: v_mov_b32_e32 v7, s3 ; GFX11-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-NEXT: .LBB97_5: ; %end @@ -14013,6 +14396,7 @@ define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s20, 0xff @@ -14077,12 +14461,15 @@ define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inre ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: s_branch .LBB99_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB99_2 +; SI-NEXT: s_branch .LBB99_3 ; ; VI-LABEL: bitcast_v8i8_to_v4i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -14137,12 +14524,15 @@ define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inre ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB99_4: ; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; VI-NEXT: s_branch .LBB99_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB99_2 +; VI-NEXT: s_branch .LBB99_3 ; ; GFX9-LABEL: bitcast_v8i8_to_v4i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -14197,35 +14587,36 @@ define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inre ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB99_4: ; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_branch .LBB99_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB99_2 +; GFX9-NEXT: s_branch .LBB99_3 ; ; GFX11-LABEL: bitcast_v8i8_to_v4i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB99_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_and_b32 s4, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s1, 8 -; GFX11-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-NEXT: s_or_b32 s7, s7, s8 -; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s8, s9 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 -; GFX11-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_cbranch_execnz .LBB99_3 ; GFX11-NEXT: .LBB99_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s0, s0, 3 ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 @@ -14259,7 +14650,9 @@ define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inre ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB99_4: ; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX11-NEXT: s_branch .LBB99_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccz .LBB99_2 +; GFX11-NEXT: s_branch .LBB99_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -14404,6 +14797,7 @@ define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v6, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s19 ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB101_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 @@ -14435,16 +14829,22 @@ define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_branch .LBB101_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB101_2 +; SI-NEXT: s_branch .LBB101_3 ; ; VI-LABEL: bitcast_v4f16_to_v4bf16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB101_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB101_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB101_4 -; VI-NEXT: .LBB101_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB101_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB101_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshr_b32 s4, s16, 16 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 @@ -14457,8 +14857,6 @@ define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_or_b32_e32 v0, v2, v3 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB101_3: -; VI-NEXT: s_branch .LBB101_2 ; VI-NEXT: .LBB101_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -14468,16 +14866,18 @@ define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB101_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB101_4 -; GFX9-NEXT: .LBB101_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB101_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 ; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB101_3: -; GFX9-NEXT: s_branch .LBB101_2 ; GFX9-NEXT: .LBB101_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -14487,17 +14887,18 @@ define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB101_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB101_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB101_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB101_4 -; GFX11-NEXT: .LBB101_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB101_3: -; GFX11-NEXT: s_branch .LBB101_2 ; GFX11-NEXT: .LBB101_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -14797,6 +15198,7 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 @@ -14832,16 +15234,22 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_branch .LBB103_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB103_2 +; SI-NEXT: s_branch .LBB103_3 ; ; VI-LABEL: bitcast_v4bf16_to_v4f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 -; VI-NEXT: s_cbranch_scc0 .LBB103_3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB103_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB103_4 -; VI-NEXT: .LBB103_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB103_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB103_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 @@ -14880,8 +15288,6 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 ; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB103_3: -; VI-NEXT: s_branch .LBB103_2 ; VI-NEXT: .LBB103_4: ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 @@ -14891,10 +15297,14 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB103_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB103_4 -; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB103_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 @@ -14936,8 +15346,6 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB103_3: -; GFX9-NEXT: s_branch .LBB103_2 ; GFX9-NEXT: .LBB103_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 @@ -14947,12 +15355,15 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB103_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB103_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB103_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccnz .LBB103_4 -; GFX11-NEXT: .LBB103_2: ; %cmp.true +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 @@ -14997,8 +15408,6 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB103_3: -; GFX11-NEXT: s_branch .LBB103_2 ; GFX11-NEXT: .LBB103_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -15267,6 +15676,7 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 ; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 @@ -15311,12 +15721,15 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_branch .LBB105_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB105_2 +; SI-NEXT: s_branch .LBB105_3 ; ; VI-LABEL: bitcast_v4f16_to_v8i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB105_3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -15351,7 +15764,8 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; VI-NEXT: ; implicit-def: $sgpr5 ; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: s_branch .LBB105_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB105_2 ; VI-NEXT: .LBB105_4: ; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: v_mov_b32_e32 v6, s10 @@ -15367,6 +15781,7 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -15394,7 +15809,8 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; GFX9-NEXT: ; implicit-def: $sgpr9 ; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: ; implicit-def: $sgpr5 -; GFX9-NEXT: s_branch .LBB105_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB105_2 ; GFX9-NEXT: .LBB105_4: ; GFX9-NEXT: v_mov_b32_e32 v8, s16 ; GFX9-NEXT: v_mov_b32_e32 v9, s17 @@ -15413,17 +15829,16 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s8, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB105_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 ; GFX11-NEXT: s_lshr_b32 s3, s1, 24 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 -; GFX11-NEXT: s_lshr_b32 s8, s0, 8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-NEXT: s_cbranch_execnz .LBB105_4 ; GFX11-NEXT: .LBB105_2: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] @@ -15436,17 +15851,18 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-NEXT: s_branch .LBB105_5 ; GFX11-NEXT: .LBB105_3: -; GFX11-NEXT: ; implicit-def: $sgpr8 ; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr6 +; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: ; implicit-def: $sgpr4 ; GFX11-NEXT: ; implicit-def: $sgpr3 -; GFX11-NEXT: s_branch .LBB105_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccz .LBB105_2 ; GFX11-NEXT: .LBB105_4: ; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 -; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 -; GFX11-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s4 ; GFX11-NEXT: v_mov_b32_e32 v7, s3 ; GFX11-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-NEXT: .LBB105_5: ; %end @@ -15839,6 +16255,7 @@ define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB107_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff @@ -15890,12 +16307,15 @@ define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inr ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_branch .LBB107_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB107_2 +; SI-NEXT: s_branch .LBB107_3 ; ; VI-LABEL: bitcast_v8i8_to_v4f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -15950,12 +16370,15 @@ define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inr ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB107_4: ; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; VI-NEXT: s_branch .LBB107_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB107_2 +; VI-NEXT: s_branch .LBB107_3 ; ; GFX9-LABEL: bitcast_v8i8_to_v4f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -16010,35 +16433,36 @@ define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inr ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB107_4: ; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_branch .LBB107_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB107_2 +; GFX9-NEXT: s_branch .LBB107_3 ; ; GFX11-LABEL: bitcast_v8i8_to_v4f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB107_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_and_b32 s4, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s1, 8 -; GFX11-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-NEXT: s_or_b32 s7, s7, s8 -; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s8, s9 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 -; GFX11-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_cbranch_execnz .LBB107_3 ; GFX11-NEXT: .LBB107_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s0, s0, 3 ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 @@ -16072,7 +16496,9 @@ define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inr ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB107_4: ; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX11-NEXT: s_branch .LBB107_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccz .LBB107_2 +; GFX11-NEXT: s_branch .LBB107_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -16475,6 +16901,7 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 @@ -16516,12 +16943,15 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_branch .LBB109_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB109_2 +; SI-NEXT: s_branch .LBB109_3 ; ; VI-LABEL: bitcast_v4bf16_to_v8i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -16585,7 +17015,8 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr9 ; VI-NEXT: ; implicit-def: $sgpr5 ; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: s_branch .LBB109_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB109_2 ; VI-NEXT: .LBB109_4: ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s10 @@ -16601,6 +17032,7 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB109_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 @@ -16666,7 +17098,8 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr10 ; GFX9-NEXT: ; implicit-def: $sgpr11 ; GFX9-NEXT: ; implicit-def: $sgpr9 -; GFX9-NEXT: s_branch .LBB109_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB109_2 ; GFX9-NEXT: .LBB109_4: ; GFX9-NEXT: v_mov_b32_e32 v6, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 @@ -16682,17 +17115,16 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s8, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB109_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 -; GFX11-NEXT: s_lshr_b32 s6, s1, 24 -; GFX11-NEXT: s_lshr_b32 s8, s1, 16 -; GFX11-NEXT: s_lshr_b32 s7, s1, 8 -; GFX11-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-NEXT: s_lshr_b32 s7, s1, 16 +; GFX11-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-NEXT: s_cbranch_execnz .LBB109_4 ; GFX11-NEXT: .LBB109_2: ; %cmp.true ; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 @@ -16748,16 +17180,17 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB109_3: ; GFX11-NEXT: ; implicit-def: $sgpr3 -; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: ; implicit-def: $sgpr4 ; GFX11-NEXT: ; implicit-def: $sgpr2 -; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr8 ; GFX11-NEXT: ; implicit-def: $sgpr6 -; GFX11-NEXT: s_branch .LBB109_2 +; GFX11-NEXT: ; implicit-def: $sgpr7 +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccz .LBB109_2 ; GFX11-NEXT: .LBB109_4: -; GFX11-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s6 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s6 +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -17150,6 +17583,7 @@ define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB111_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xff @@ -17211,12 +17645,15 @@ define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: s_branch .LBB111_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB111_2 +; SI-NEXT: s_branch .LBB111_3 ; ; VI-LABEL: bitcast_v8i8_to_v4bf16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_mov_b64 s[6:7], -1 ; VI-NEXT: s_cbranch_scc0 .LBB111_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -17271,12 +17708,15 @@ define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB111_4: ; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 -; VI-NEXT: s_branch .LBB111_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; VI-NEXT: s_cbranch_vccz .LBB111_2 +; VI-NEXT: s_branch .LBB111_3 ; ; GFX9-LABEL: bitcast_v8i8_to_v4bf16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], -1 ; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -17331,35 +17771,36 @@ define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB111_4: ; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_branch .LBB111_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX9-NEXT: s_cbranch_vccz .LBB111_2 +; GFX9-NEXT: s_branch .LBB111_3 ; ; GFX11-LABEL: bitcast_v8i8_to_v4bf16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB111_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_and_b32 s4, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s1, 8 -; GFX11-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-NEXT: s_or_b32 s7, s7, s8 -; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s8, s9 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_or_b32 s5, s7, s8 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 -; GFX11-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_cbranch_execnz .LBB111_3 ; GFX11-NEXT: .LBB111_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s0, s0, 3 ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 @@ -17393,7 +17834,9 @@ define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB111_4: ; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX11-NEXT: s_branch .LBB111_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccz .LBB111_2 +; GFX11-NEXT: s_branch .LBB111_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false |
