diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 6978 |
1 files changed, 3619 insertions, 3359 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 4f4687507680..6e60051bd996 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -201,6 +201,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -218,7 +219,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -232,10 +233,13 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: s_cbranch_scc0 .LBB1_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB1_3 -; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB1_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB1_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 @@ -266,16 +270,15 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: .LBB1_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB1_4: -; SI-NEXT: s_branch .LBB1_2 ; ; VI-LABEL: bitcast_v30i32_to_v30f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -293,7 +296,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -307,10 +310,13 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: s_cbranch_scc0 .LBB1_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB1_3 -; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB1_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB1_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 @@ -341,16 +347,15 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: .LBB1_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB1_4: -; VI-NEXT: s_branch .LBB1_2 ; ; GFX9-LABEL: bitcast_v30i32_to_v30f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -368,7 +373,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -382,10 +387,13 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB1_3 -; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB1_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB1_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 ; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 @@ -416,43 +424,41 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: .LBB1_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB1_4: -; GFX9-NEXT: s_branch .LBB1_2 ; ; GFX11-LABEL: bitcast_v30i32_to_v30f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB1_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB1_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB1_3: -; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB1_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 ; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 @@ -483,6 +489,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB1_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -681,6 +688,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -698,7 +706,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -712,10 +720,13 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: s_cbranch_scc0 .LBB3_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB3_3 -; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB3_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB3_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 @@ -746,16 +757,15 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: .LBB3_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB3_4: -; SI-NEXT: s_branch .LBB3_2 ; ; VI-LABEL: bitcast_v30f32_to_v30i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -773,7 +783,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -787,10 +797,13 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: s_cbranch_scc0 .LBB3_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB3_3 -; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB3_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB3_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 @@ -821,16 +834,15 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: .LBB3_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB3_4: -; VI-NEXT: s_branch .LBB3_2 ; ; GFX9-LABEL: bitcast_v30f32_to_v30i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -848,7 +860,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -862,10 +874,13 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB3_3 -; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB3_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 @@ -896,43 +911,41 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: .LBB3_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB3_4: -; GFX9-NEXT: s_branch .LBB3_2 ; ; GFX11-LABEL: bitcast_v30f32_to_v30i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB3_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB3_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB3_3: -; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -948,6 +961,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB3_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -1161,6 +1175,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -1178,7 +1193,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -1192,10 +1207,13 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: s_cbranch_scc0 .LBB5_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB5_3 -; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB5_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB5_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 @@ -1226,16 +1244,15 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: .LBB5_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB5_4: -; SI-NEXT: s_branch .LBB5_2 ; ; VI-LABEL: bitcast_v30i32_to_v15i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -1253,7 +1270,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -1267,10 +1284,13 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: s_cbranch_scc0 .LBB5_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB5_3 -; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB5_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB5_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 @@ -1301,16 +1321,15 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: .LBB5_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB5_4: -; VI-NEXT: s_branch .LBB5_2 ; ; GFX9-LABEL: bitcast_v30i32_to_v15i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -1328,7 +1347,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -1342,10 +1361,13 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB5_3 -; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB5_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB5_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 ; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 @@ -1376,43 +1398,41 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: .LBB5_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB5_4: -; GFX9-NEXT: s_branch .LBB5_2 ; ; GFX11-LABEL: bitcast_v30i32_to_v15i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB5_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB5_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB5_3: -; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB5_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 ; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 @@ -1443,6 +1463,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB5_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -1664,6 +1685,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -1681,7 +1703,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -1695,10 +1717,13 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: s_cbranch_scc0 .LBB7_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB7_3 -; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB7_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB7_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 @@ -1729,16 +1754,15 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: .LBB7_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB7_4: -; SI-NEXT: s_branch .LBB7_2 ; ; VI-LABEL: bitcast_v15i64_to_v30i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -1756,7 +1780,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -1770,10 +1794,13 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: s_cbranch_scc0 .LBB7_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB7_3 -; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB7_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 ; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 @@ -1804,16 +1831,15 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: .LBB7_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB7_4: -; VI-NEXT: s_branch .LBB7_2 ; ; GFX9-LABEL: bitcast_v15i64_to_v30i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -1831,7 +1857,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -1845,10 +1871,13 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB7_3 -; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc ; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 @@ -1879,43 +1908,41 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: .LBB7_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB7_4: -; GFX9-NEXT: s_branch .LBB7_2 ; ; GFX11-LABEL: bitcast_v15i64_to_v30i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB7_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB7_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB7_3: -; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo @@ -1954,6 +1981,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB7_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2167,6 +2195,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -2184,7 +2213,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -2198,10 +2227,13 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: s_cbranch_scc0 .LBB9_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB9_3 -; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB9_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB9_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 @@ -2232,16 +2264,15 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: .LBB9_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB9_4: -; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_v30i32_to_v15f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -2259,7 +2290,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -2273,10 +2304,13 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: s_cbranch_scc0 .LBB9_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB9_3 -; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB9_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 @@ -2307,16 +2341,15 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: .LBB9_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB9_4: -; VI-NEXT: s_branch .LBB9_2 ; ; GFX9-LABEL: bitcast_v30i32_to_v15f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -2334,7 +2367,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -2348,10 +2381,13 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB9_3 -; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB9_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 ; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 @@ -2382,43 +2418,41 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: .LBB9_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB9_4: -; GFX9-NEXT: s_branch .LBB9_2 ; ; GFX11-LABEL: bitcast_v30i32_to_v15f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB9_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB9_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB9_3: -; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB9_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 ; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 @@ -2449,6 +2483,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB9_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2602,6 +2637,7 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -2630,13 +2666,16 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB11_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB11_3 -; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB11_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB11_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -2652,17 +2691,16 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: .LBB11_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: v_mov_b32_e32 v17, v31 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB11_4: -; SI-NEXT: s_branch .LBB11_2 ; ; VI-LABEL: bitcast_v15f64_to_v30i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -2691,13 +2729,16 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB11_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB11_3 -; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB11_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB11_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -2713,17 +2754,16 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: .LBB11_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: v_mov_b32_e32 v17, v31 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB11_4: -; VI-NEXT: s_branch .LBB11_2 ; ; GFX9-LABEL: bitcast_v15f64_to_v30i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -2752,13 +2792,16 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB11_3 -; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB11_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -2774,44 +2817,42 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: .LBB11_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: v_mov_b32_e32 v17, v31 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB11_4: -; GFX9-NEXT: s_branch .LBB11_2 ; ; GFX11-LABEL: bitcast_v15f64_to_v30i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB11_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB11_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB11_3: -; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -2827,6 +2868,7 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB11_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3848,6 +3890,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s45, v1 ; SI-NEXT: v_readfirstlane_b32 s44, v2 ; SI-NEXT: v_readfirstlane_b32 s43, v3 @@ -3863,8 +3906,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s9, v13 ; SI-NEXT: v_readfirstlane_b32 s8, v14 ; SI-NEXT: v_readfirstlane_b32 s7, v15 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v1, s7 @@ -4202,7 +4245,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: s_branch .LBB13_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB13_2 +; SI-NEXT: s_branch .LBB13_3 ; ; VI-LABEL: bitcast_v30i32_to_v60i16_scalar: ; VI: ; %bb.0: @@ -4215,8 +4260,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: v_writelane_b32 v30, s34, 2 ; VI-NEXT: v_writelane_b32 v30, s35, 3 ; VI-NEXT: v_writelane_b32 v30, s36, 4 -; VI-NEXT: v_writelane_b32 v30, s37, 5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_writelane_b32 v30, s38, 6 ; VI-NEXT: v_readfirstlane_b32 s45, v0 ; VI-NEXT: v_readfirstlane_b32 s44, v1 @@ -4232,14 +4278,14 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s10, v11 ; VI-NEXT: v_readfirstlane_b32 s9, v12 ; VI-NEXT: v_readfirstlane_b32 s8, v13 -; VI-NEXT: v_readfirstlane_b32 s6, v14 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_writelane_b32 v30, s39, 7 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_lshr_b32 s47, s7, 16 ; VI-NEXT: s_lshr_b32 s56, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 @@ -4270,8 +4316,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s39, s16, 16 ; VI-NEXT: s_cbranch_execnz .LBB13_3 ; VI-NEXT: .LBB13_2: ; %cmp.true -; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_add_i32 s8, s8, 3 ; VI-NEXT: s_add_i32 s9, s9, 3 ; VI-NEXT: s_add_i32 s10, s10, 3 @@ -4300,8 +4346,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: s_add_i32 s18, s18, 3 ; VI-NEXT: s_add_i32 s17, s17, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_lshr_b32 s47, s7, 16 ; VI-NEXT: s_lshr_b32 s56, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 @@ -4415,12 +4461,12 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 ; VI-NEXT: s_lshl_b32 s44, s56, 16 ; VI-NEXT: s_or_b32 s8, s8, s44 -; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s44, s47, 16 -; VI-NEXT: s_or_b32 s6, s6, s44 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_lshl_b32 s44, s47, 16 ; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s16 @@ -4449,8 +4495,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v25, s10 ; VI-NEXT: v_mov_b32_e32 v26, s9 ; VI-NEXT: v_mov_b32_e32 v27, s8 -; VI-NEXT: v_mov_b32_e32 v28, s6 -; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_mov_b32_e32 v28, s7 +; VI-NEXT: v_mov_b32_e32 v29, s6 ; VI-NEXT: v_readlane_b32 s39, v30, 7 ; VI-NEXT: v_readlane_b32 s38, v30, 6 ; VI-NEXT: v_readlane_b32 s37, v30, 5 @@ -4495,7 +4541,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: s_branch .LBB13_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB13_2 +; VI-NEXT: s_branch .LBB13_3 ; ; GFX9-LABEL: bitcast_v30i32_to_v60i16_scalar: ; GFX9: ; %bb.0: @@ -4504,45 +4552,46 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v30, s30, 0 -; GFX9-NEXT: v_writelane_b32 v30, s31, 1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_writelane_b32 v30, s34, 2 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 -; GFX9-NEXT: v_readfirstlane_b32 s40, v10 -; GFX9-NEXT: v_readfirstlane_b32 s41, v11 -; GFX9-NEXT: v_readfirstlane_b32 s42, v12 -; GFX9-NEXT: v_readfirstlane_b32 s43, v13 -; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s9, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v3 +; GFX9-NEXT: v_readfirstlane_b32 s11, v4 +; GFX9-NEXT: v_readfirstlane_b32 s12, v5 +; GFX9-NEXT: v_readfirstlane_b32 s13, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s40, v9 +; GFX9-NEXT: v_readfirstlane_b32 s41, v10 +; GFX9-NEXT: v_readfirstlane_b32 s42, v11 +; GFX9-NEXT: v_readfirstlane_b32 s43, v12 +; GFX9-NEXT: v_readfirstlane_b32 s44, v13 +; GFX9-NEXT: v_readfirstlane_b32 s45, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_writelane_b32 v30, s35, 3 ; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: s_lshr_b32 s47, s44, 16 -; GFX9-NEXT: s_lshr_b32 s56, s43, 16 -; GFX9-NEXT: s_lshr_b32 s57, s42, 16 -; GFX9-NEXT: s_lshr_b32 s58, s41, 16 -; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: s_lshr_b32 s47, s45, 16 +; GFX9-NEXT: s_lshr_b32 s56, s44, 16 +; GFX9-NEXT: s_lshr_b32 s57, s43, 16 +; GFX9-NEXT: s_lshr_b32 s58, s42, 16 +; GFX9-NEXT: s_lshr_b32 s59, s41, 16 +; GFX9-NEXT: s_lshr_b32 s60, s40, 16 +; GFX9-NEXT: s_lshr_b32 s61, s15, 16 +; GFX9-NEXT: s_lshr_b32 s62, s14, 16 +; GFX9-NEXT: s_lshr_b32 s63, s13, 16 +; GFX9-NEXT: s_lshr_b32 s72, s12, 16 +; GFX9-NEXT: s_lshr_b32 s73, s11, 16 +; GFX9-NEXT: s_lshr_b32 s74, s10, 16 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s77, s7, 16 ; GFX9-NEXT: s_lshr_b32 s78, s29, 16 ; GFX9-NEXT: s_lshr_b32 s79, s28, 16 ; GFX9-NEXT: s_lshr_b32 s88, s27, 16 @@ -4559,6 +4608,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s35, s16, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB13_3 ; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s45, s45, 3 ; GFX9-NEXT: s_add_i32 s44, s44, 3 ; GFX9-NEXT: s_add_i32 s43, s43, 3 @@ -4574,7 +4624,6 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s9, s9, 3 ; GFX9-NEXT: s_add_i32 s8, s8, 3 ; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s29, s29, 3 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 @@ -4589,22 +4638,22 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: s_lshr_b32 s47, s44, 16 -; GFX9-NEXT: s_lshr_b32 s56, s43, 16 -; GFX9-NEXT: s_lshr_b32 s57, s42, 16 -; GFX9-NEXT: s_lshr_b32 s58, s41, 16 -; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: s_lshr_b32 s47, s45, 16 +; GFX9-NEXT: s_lshr_b32 s56, s44, 16 +; GFX9-NEXT: s_lshr_b32 s57, s43, 16 +; GFX9-NEXT: s_lshr_b32 s58, s42, 16 +; GFX9-NEXT: s_lshr_b32 s59, s41, 16 +; GFX9-NEXT: s_lshr_b32 s60, s40, 16 +; GFX9-NEXT: s_lshr_b32 s61, s15, 16 +; GFX9-NEXT: s_lshr_b32 s62, s14, 16 +; GFX9-NEXT: s_lshr_b32 s63, s13, 16 +; GFX9-NEXT: s_lshr_b32 s72, s12, 16 +; GFX9-NEXT: s_lshr_b32 s73, s11, 16 +; GFX9-NEXT: s_lshr_b32 s74, s10, 16 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s77, s7, 16 ; GFX9-NEXT: s_lshr_b32 s78, s29, 16 ; GFX9-NEXT: s_lshr_b32 s79, s28, 16 ; GFX9-NEXT: s_lshr_b32 s88, s27, 16 @@ -4634,22 +4683,22 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 ; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 ; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s46 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 @@ -4664,22 +4713,22 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v11, s25 ; GFX9-NEXT: v_mov_b32_e32 v12, s26 ; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 -; GFX9-NEXT: v_mov_b32_e32 v24, s28 -; GFX9-NEXT: v_mov_b32_e32 v25, s29 -; GFX9-NEXT: v_mov_b32_e32 v26, s40 -; GFX9-NEXT: v_mov_b32_e32 v27, s41 -; GFX9-NEXT: v_mov_b32_e32 v28, s42 -; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_mov_b32_e32 v19, s12 +; GFX9-NEXT: v_mov_b32_e32 v20, s13 +; GFX9-NEXT: v_mov_b32_e32 v21, s14 +; GFX9-NEXT: v_mov_b32_e32 v22, s15 +; GFX9-NEXT: v_mov_b32_e32 v23, s28 +; GFX9-NEXT: v_mov_b32_e32 v24, s29 +; GFX9-NEXT: v_mov_b32_e32 v25, s40 +; GFX9-NEXT: v_mov_b32_e32 v26, s41 +; GFX9-NEXT: v_mov_b32_e32 v27, s42 +; GFX9-NEXT: v_mov_b32_e32 v28, s43 +; GFX9-NEXT: v_mov_b32_e32 v29, s6 ; GFX9-NEXT: v_readlane_b32 s35, v30, 3 ; GFX9-NEXT: v_readlane_b32 s34, v30, 2 ; GFX9-NEXT: v_readlane_b32 s31, v30, 1 @@ -4720,7 +4769,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: ; implicit-def: $sgpr47 ; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: s_branch .LBB13_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB13_2 +; GFX9-NEXT: s_branch .LBB13_3 ; ; GFX11-LABEL: bitcast_v30i32_to_v60i16_scalar: ; GFX11: ; %bb.0: @@ -4735,16 +4786,16 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX11-NEXT: v_readfirstlane_b32 s10, v6 ; GFX11-NEXT: v_readfirstlane_b32 s11, v7 ; GFX11-NEXT: v_readfirstlane_b32 s12, v8 -; GFX11-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-NEXT: v_readfirstlane_b32 s14, v9 ; GFX11-NEXT: v_readfirstlane_b32 s15, v10 -; GFX11-NEXT: v_readfirstlane_b32 s14, v11 -; GFX11-NEXT: s_mov_b32 s94, 0 +; GFX11-NEXT: v_readfirstlane_b32 s13, v11 +; GFX11-NEXT: s_mov_b32 s94, -1 ; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 ; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_lshr_b32 s43, s12, 16 ; GFX11-NEXT: s_lshr_b32 s44, s11, 16 ; GFX11-NEXT: s_lshr_b32 s45, s10, 16 @@ -4772,12 +4823,11 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 ; GFX11-NEXT: s_lshr_b32 s93, s0, 16 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 -; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: s_cbranch_execnz .LBB13_3 ; GFX11-NEXT: .LBB13_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s14, s14, 3 -; GFX11-NEXT: s_add_i32 s15, s15, 3 ; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 ; GFX11-NEXT: s_add_i32 s12, s12, 3 ; GFX11-NEXT: s_add_i32 s11, s11, 3 ; GFX11-NEXT: s_add_i32 s10, s10, 3 @@ -4805,9 +4855,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 ; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_lshr_b32 s43, s12, 16 ; GFX11-NEXT: s_lshr_b32 s44, s11, 16 ; GFX11-NEXT: s_lshr_b32 s45, s10, 16 @@ -4864,9 +4914,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45 ; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44 ; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s42 ; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 @@ -4880,8 +4930,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 ; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 ; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 -; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s14 +; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s13 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB13_4: ; GFX11-NEXT: ; implicit-def: $sgpr93 @@ -4914,7 +4964,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: s_branch .LBB13_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-NEXT: s_cbranch_vccz .LBB13_2 +; GFX11-NEXT: s_branch .LBB13_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6332,6 +6384,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v35, v22 ; SI-NEXT: v_mov_b32_e32 v36, v20 ; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 @@ -6363,7 +6416,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 @@ -6666,7 +6719,9 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: s_branch .LBB15_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB15_2 +; SI-NEXT: s_branch .LBB15_3 ; ; VI-LABEL: bitcast_v60i16_to_v30i32_scalar: ; VI: ; %bb.0: @@ -6686,6 +6741,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s42, s17, 16 ; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v32, v15 ; VI-NEXT: v_mov_b32_e32 v33, v14 ; VI-NEXT: v_mov_b32_e32 v34, v13 @@ -6702,7 +6758,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v53, v2 ; VI-NEXT: v_mov_b32_e32 v54, v1 ; VI-NEXT: v_mov_b32_e32 v55, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 @@ -6949,11 +7005,28 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB15_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB15_2 +; VI-NEXT: s_branch .LBB15_3 ; ; GFX9-LABEL: bitcast_v60i16_to_v30i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 ; GFX9-NEXT: v_mov_b32_e32 v32, v15 ; GFX9-NEXT: v_mov_b32_e32 v33, v14 ; GFX9-NEXT: v_mov_b32_e32 v34, v13 @@ -6970,21 +7043,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v53, v2 ; GFX9-NEXT: v_mov_b32_e32 v54, v1 ; GFX9-NEXT: v_mov_b32_e32 v55, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 -; GFX9-NEXT: s_lshr_b32 s41, s28, 16 -; GFX9-NEXT: s_lshr_b32 s42, s27, 16 -; GFX9-NEXT: s_lshr_b32 s43, s26, 16 -; GFX9-NEXT: s_lshr_b32 s15, s25, 16 -; GFX9-NEXT: s_lshr_b32 s14, s24, 16 -; GFX9-NEXT: s_lshr_b32 s13, s23, 16 -; GFX9-NEXT: s_lshr_b32 s12, s22, 16 -; GFX9-NEXT: s_lshr_b32 s11, s21, 16 -; GFX9-NEXT: s_lshr_b32 s10, s20, 16 -; GFX9-NEXT: s_lshr_b32 s9, s19, 16 -; GFX9-NEXT: s_lshr_b32 s8, s18, 16 -; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -7005,7 +7064,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 @@ -7020,6 +7078,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 @@ -7165,7 +7224,9 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB15_4: ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB15_2 +; GFX9-NEXT: s_branch .LBB15_3 ; ; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -7210,41 +7271,41 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -7259,17 +7320,16 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -7283,24 +7343,24 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -7317,7 +7377,9 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB15_4: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB15_2 +; GFX11-TRUE16-NEXT: s_branch .LBB15_3 ; ; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32_scalar: ; GFX11-FAKE16: ; %bb.0: @@ -7350,41 +7412,41 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -7399,17 +7461,16 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -7423,24 +7484,24 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -7457,7 +7518,9 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB15_4: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_branch .LBB15_2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB15_2 +; GFX11-FAKE16-NEXT: s_branch .LBB15_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8835,6 +8898,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s45, v1 ; SI-NEXT: v_readfirstlane_b32 s44, v2 ; SI-NEXT: v_readfirstlane_b32 s43, v3 @@ -8847,11 +8911,11 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_readfirstlane_b32 s12, v10 ; SI-NEXT: v_readfirstlane_b32 s11, v11 ; SI-NEXT: v_readfirstlane_b32 s10, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s7, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v15 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -8867,13 +8931,13 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 @@ -8931,10 +8995,10 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 @@ -8989,10 +9053,10 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: s_lshr_b32 s5, s17, 16 ; SI-NEXT: s_lshr_b32 s46, s18, 16 @@ -9019,14 +9083,14 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s91, s12, 16 ; SI-NEXT: s_lshr_b32 s92, s11, 16 ; SI-NEXT: s_lshr_b32 s93, s10, 16 -; SI-NEXT: s_lshr_b32 s94, s8, 16 -; SI-NEXT: s_lshr_b32 s95, s7, 16 -; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 -; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s8, 16 +; SI-NEXT: s_lshr_b32 vcc_lo, s7, 16 +; SI-NEXT: s_lshr_b32 vcc_hi, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 @@ -9373,7 +9437,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_branch .LBB17_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB17_2 +; SI-NEXT: s_branch .LBB17_3 ; ; VI-LABEL: bitcast_v30i32_to_v60f16_scalar: ; VI: ; %bb.0: @@ -9386,8 +9452,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: v_writelane_b32 v30, s34, 2 ; VI-NEXT: v_writelane_b32 v30, s35, 3 ; VI-NEXT: v_writelane_b32 v30, s36, 4 -; VI-NEXT: v_writelane_b32 v30, s37, 5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_writelane_b32 v30, s38, 6 ; VI-NEXT: v_readfirstlane_b32 s45, v0 ; VI-NEXT: v_readfirstlane_b32 s44, v1 @@ -9403,14 +9470,14 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: v_readfirstlane_b32 s10, v11 ; VI-NEXT: v_readfirstlane_b32 s9, v12 ; VI-NEXT: v_readfirstlane_b32 s8, v13 -; VI-NEXT: v_readfirstlane_b32 s6, v14 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_writelane_b32 v30, s39, 7 ; VI-NEXT: s_cbranch_scc0 .LBB17_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_lshr_b32 s47, s7, 16 ; VI-NEXT: s_lshr_b32 s56, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 @@ -9441,8 +9508,8 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: s_lshr_b32 s39, s16, 16 ; VI-NEXT: s_cbranch_execnz .LBB17_3 ; VI-NEXT: .LBB17_2: ; %cmp.true -; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 ; VI-NEXT: s_add_i32 s8, s8, 3 ; VI-NEXT: s_add_i32 s9, s9, 3 ; VI-NEXT: s_add_i32 s10, s10, 3 @@ -9471,8 +9538,8 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: s_add_i32 s18, s18, 3 ; VI-NEXT: s_add_i32 s17, s17, 3 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_lshr_b32 s47, s7, 16 ; VI-NEXT: s_lshr_b32 s56, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 @@ -9586,12 +9653,12 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: s_and_b32 s8, 0xffff, s8 ; VI-NEXT: s_lshl_b32 s44, s56, 16 ; VI-NEXT: s_or_b32 s8, s8, s44 -; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s44, s47, 16 -; VI-NEXT: s_or_b32 s6, s6, s44 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_lshl_b32 s44, s47, 16 ; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s16 @@ -9620,8 +9687,8 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v25, s10 ; VI-NEXT: v_mov_b32_e32 v26, s9 ; VI-NEXT: v_mov_b32_e32 v27, s8 -; VI-NEXT: v_mov_b32_e32 v28, s6 -; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_mov_b32_e32 v28, s7 +; VI-NEXT: v_mov_b32_e32 v29, s6 ; VI-NEXT: v_readlane_b32 s39, v30, 7 ; VI-NEXT: v_readlane_b32 s38, v30, 6 ; VI-NEXT: v_readlane_b32 s37, v30, 5 @@ -9666,7 +9733,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: s_branch .LBB17_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB17_2 +; VI-NEXT: s_branch .LBB17_3 ; ; GFX9-LABEL: bitcast_v30i32_to_v60f16_scalar: ; GFX9: ; %bb.0: @@ -9675,45 +9744,46 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v30, s30, 0 -; GFX9-NEXT: v_writelane_b32 v30, s31, 1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_writelane_b32 v30, s34, 2 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 -; GFX9-NEXT: v_readfirstlane_b32 s40, v10 -; GFX9-NEXT: v_readfirstlane_b32 s41, v11 -; GFX9-NEXT: v_readfirstlane_b32 s42, v12 -; GFX9-NEXT: v_readfirstlane_b32 s43, v13 -; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s9, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v3 +; GFX9-NEXT: v_readfirstlane_b32 s11, v4 +; GFX9-NEXT: v_readfirstlane_b32 s12, v5 +; GFX9-NEXT: v_readfirstlane_b32 s13, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s40, v9 +; GFX9-NEXT: v_readfirstlane_b32 s41, v10 +; GFX9-NEXT: v_readfirstlane_b32 s42, v11 +; GFX9-NEXT: v_readfirstlane_b32 s43, v12 +; GFX9-NEXT: v_readfirstlane_b32 s44, v13 +; GFX9-NEXT: v_readfirstlane_b32 s45, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_writelane_b32 v30, s35, 3 ; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: s_lshr_b32 s47, s44, 16 -; GFX9-NEXT: s_lshr_b32 s56, s43, 16 -; GFX9-NEXT: s_lshr_b32 s57, s42, 16 -; GFX9-NEXT: s_lshr_b32 s58, s41, 16 -; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: s_lshr_b32 s47, s45, 16 +; GFX9-NEXT: s_lshr_b32 s56, s44, 16 +; GFX9-NEXT: s_lshr_b32 s57, s43, 16 +; GFX9-NEXT: s_lshr_b32 s58, s42, 16 +; GFX9-NEXT: s_lshr_b32 s59, s41, 16 +; GFX9-NEXT: s_lshr_b32 s60, s40, 16 +; GFX9-NEXT: s_lshr_b32 s61, s15, 16 +; GFX9-NEXT: s_lshr_b32 s62, s14, 16 +; GFX9-NEXT: s_lshr_b32 s63, s13, 16 +; GFX9-NEXT: s_lshr_b32 s72, s12, 16 +; GFX9-NEXT: s_lshr_b32 s73, s11, 16 +; GFX9-NEXT: s_lshr_b32 s74, s10, 16 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s77, s7, 16 ; GFX9-NEXT: s_lshr_b32 s78, s29, 16 ; GFX9-NEXT: s_lshr_b32 s79, s28, 16 ; GFX9-NEXT: s_lshr_b32 s88, s27, 16 @@ -9730,6 +9800,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s35, s16, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB17_3 ; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s45, s45, 3 ; GFX9-NEXT: s_add_i32 s44, s44, 3 ; GFX9-NEXT: s_add_i32 s43, s43, 3 @@ -9745,7 +9816,6 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: s_add_i32 s9, s9, 3 ; GFX9-NEXT: s_add_i32 s8, s8, 3 ; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: s_add_i32 s6, s6, 3 ; GFX9-NEXT: s_add_i32 s29, s29, 3 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_add_i32 s27, s27, 3 @@ -9760,22 +9830,22 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_add_i32 s17, s17, 3 ; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: s_lshr_b32 s47, s44, 16 -; GFX9-NEXT: s_lshr_b32 s56, s43, 16 -; GFX9-NEXT: s_lshr_b32 s57, s42, 16 -; GFX9-NEXT: s_lshr_b32 s58, s41, 16 -; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: s_lshr_b32 s47, s45, 16 +; GFX9-NEXT: s_lshr_b32 s56, s44, 16 +; GFX9-NEXT: s_lshr_b32 s57, s43, 16 +; GFX9-NEXT: s_lshr_b32 s58, s42, 16 +; GFX9-NEXT: s_lshr_b32 s59, s41, 16 +; GFX9-NEXT: s_lshr_b32 s60, s40, 16 +; GFX9-NEXT: s_lshr_b32 s61, s15, 16 +; GFX9-NEXT: s_lshr_b32 s62, s14, 16 +; GFX9-NEXT: s_lshr_b32 s63, s13, 16 +; GFX9-NEXT: s_lshr_b32 s72, s12, 16 +; GFX9-NEXT: s_lshr_b32 s73, s11, 16 +; GFX9-NEXT: s_lshr_b32 s74, s10, 16 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s77, s7, 16 ; GFX9-NEXT: s_lshr_b32 s78, s29, 16 ; GFX9-NEXT: s_lshr_b32 s79, s28, 16 ; GFX9-NEXT: s_lshr_b32 s88, s27, 16 @@ -9805,22 +9875,22 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 ; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 ; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s46 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 @@ -9835,22 +9905,22 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v11, s25 ; GFX9-NEXT: v_mov_b32_e32 v12, s26 ; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 -; GFX9-NEXT: v_mov_b32_e32 v24, s28 -; GFX9-NEXT: v_mov_b32_e32 v25, s29 -; GFX9-NEXT: v_mov_b32_e32 v26, s40 -; GFX9-NEXT: v_mov_b32_e32 v27, s41 -; GFX9-NEXT: v_mov_b32_e32 v28, s42 -; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_mov_b32_e32 v19, s12 +; GFX9-NEXT: v_mov_b32_e32 v20, s13 +; GFX9-NEXT: v_mov_b32_e32 v21, s14 +; GFX9-NEXT: v_mov_b32_e32 v22, s15 +; GFX9-NEXT: v_mov_b32_e32 v23, s28 +; GFX9-NEXT: v_mov_b32_e32 v24, s29 +; GFX9-NEXT: v_mov_b32_e32 v25, s40 +; GFX9-NEXT: v_mov_b32_e32 v26, s41 +; GFX9-NEXT: v_mov_b32_e32 v27, s42 +; GFX9-NEXT: v_mov_b32_e32 v28, s43 +; GFX9-NEXT: v_mov_b32_e32 v29, s6 ; GFX9-NEXT: v_readlane_b32 s35, v30, 3 ; GFX9-NEXT: v_readlane_b32 s34, v30, 2 ; GFX9-NEXT: v_readlane_b32 s31, v30, 1 @@ -9891,7 +9961,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: ; implicit-def: $sgpr47 ; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: s_branch .LBB17_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB17_2 +; GFX9-NEXT: s_branch .LBB17_3 ; ; GFX11-LABEL: bitcast_v30i32_to_v60f16_scalar: ; GFX11: ; %bb.0: @@ -9906,16 +9978,16 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX11-NEXT: v_readfirstlane_b32 s10, v6 ; GFX11-NEXT: v_readfirstlane_b32 s11, v7 ; GFX11-NEXT: v_readfirstlane_b32 s12, v8 -; GFX11-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-NEXT: v_readfirstlane_b32 s14, v9 ; GFX11-NEXT: v_readfirstlane_b32 s15, v10 -; GFX11-NEXT: v_readfirstlane_b32 s14, v11 -; GFX11-NEXT: s_mov_b32 s94, 0 +; GFX11-NEXT: v_readfirstlane_b32 s13, v11 +; GFX11-NEXT: s_mov_b32 s94, -1 ; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 ; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_lshr_b32 s43, s12, 16 ; GFX11-NEXT: s_lshr_b32 s44, s11, 16 ; GFX11-NEXT: s_lshr_b32 s45, s10, 16 @@ -9943,12 +10015,11 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 ; GFX11-NEXT: s_lshr_b32 s93, s0, 16 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 -; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: s_cbranch_execnz .LBB17_3 ; GFX11-NEXT: .LBB17_2: ; %cmp.true -; GFX11-NEXT: s_add_i32 s14, s14, 3 -; GFX11-NEXT: s_add_i32 s15, s15, 3 ; GFX11-NEXT: s_add_i32 s13, s13, 3 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 ; GFX11-NEXT: s_add_i32 s12, s12, 3 ; GFX11-NEXT: s_add_i32 s11, s11, 3 ; GFX11-NEXT: s_add_i32 s10, s10, 3 @@ -9976,9 +10047,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX11-NEXT: s_add_i32 s2, s2, 3 ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 ; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_lshr_b32 s43, s12, 16 ; GFX11-NEXT: s_lshr_b32 s44, s11, 16 ; GFX11-NEXT: s_lshr_b32 s45, s10, 16 @@ -10035,9 +10106,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45 ; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44 ; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s42 ; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 @@ -10051,8 +10122,8 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 ; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 ; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 -; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s14 +; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s13 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB17_4: ; GFX11-NEXT: ; implicit-def: $sgpr93 @@ -10085,7 +10156,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: s_branch .LBB17_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-NEXT: s_cbranch_vccz .LBB17_2 +; GFX11-NEXT: s_branch .LBB17_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11696,11 +11769,11 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 @@ -11714,83 +11787,92 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11799,260 +11881,240 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: v_or_b32_e32 v8, v49, v8 +; SI-NEXT: v_mov_b32_e32 v57, v48 +; SI-NEXT: v_mov_b32_e32 v56, v36 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: v_mov_b32_e32 v46, v37 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v45, v35 ; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v44, v34 ; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: v_or_b32_e32 v14, v55, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v16, v53, v16 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 +; SI-NEXT: v_mov_b32_e32 v35, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v54, v29 -; SI-NEXT: v_mov_b32_e32 v54, v32 ; SI-NEXT: s_branch .LBB19_3 ; SI-NEXT: .LBB19_2: -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: v_mov_b32_e32 v52, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v45, v35 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: v_mov_b32_e32 v57, v48 +; SI-NEXT: v_mov_b32_e32 v56, v36 +; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: v_mov_b32_e32 v46, v37 ; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v44, v34 ; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v43, v62 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 ; SI-NEXT: .LBB19_3: ; %Flow ; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 ; SI-NEXT: v_mov_b32_e32 v55, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -12060,42 +12122,48 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -12108,65 +12176,65 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -12178,14 +12246,14 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -12193,9 +12261,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -12205,7 +12273,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -12217,32 +12285,39 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -12327,6 +12402,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; VI-NEXT: s_lshr_b32 s42, s17, 16 ; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v32, v15 ; VI-NEXT: v_mov_b32_e32 v33, v14 ; VI-NEXT: v_mov_b32_e32 v34, v13 @@ -12343,7 +12419,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v53, v2 ; VI-NEXT: v_mov_b32_e32 v54, v1 ; VI-NEXT: v_mov_b32_e32 v55, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: s_cbranch_scc0 .LBB19_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 @@ -12547,11 +12623,28 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB19_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB19_2 +; VI-NEXT: s_branch .LBB19_3 ; ; GFX9-LABEL: bitcast_v60f16_to_v30i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 ; GFX9-NEXT: v_mov_b32_e32 v32, v15 ; GFX9-NEXT: v_mov_b32_e32 v33, v14 ; GFX9-NEXT: v_mov_b32_e32 v34, v13 @@ -12568,21 +12661,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v53, v2 ; GFX9-NEXT: v_mov_b32_e32 v54, v1 ; GFX9-NEXT: v_mov_b32_e32 v55, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 -; GFX9-NEXT: s_lshr_b32 s41, s28, 16 -; GFX9-NEXT: s_lshr_b32 s42, s27, 16 -; GFX9-NEXT: s_lshr_b32 s43, s26, 16 -; GFX9-NEXT: s_lshr_b32 s15, s25, 16 -; GFX9-NEXT: s_lshr_b32 s14, s24, 16 -; GFX9-NEXT: s_lshr_b32 s13, s23, 16 -; GFX9-NEXT: s_lshr_b32 s12, s22, 16 -; GFX9-NEXT: s_lshr_b32 s11, s21, 16 -; GFX9-NEXT: s_lshr_b32 s10, s20, 16 -; GFX9-NEXT: s_lshr_b32 s9, s19, 16 -; GFX9-NEXT: s_lshr_b32 s8, s18, 16 -; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -12603,7 +12682,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 @@ -12618,6 +12696,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 @@ -12765,7 +12844,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB19_4: ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB19_2 +; GFX9-NEXT: s_branch .LBB19_3 ; ; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -12810,41 +12891,41 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -12859,17 +12940,16 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_3 ; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -12883,24 +12963,24 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -12917,7 +12997,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB19_4: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB19_2 +; GFX11-TRUE16-NEXT: s_branch .LBB19_3 ; ; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32_scalar: ; GFX11-FAKE16: ; %bb.0: @@ -12950,41 +13032,41 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -12999,17 +13081,16 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_3 ; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -13023,24 +13104,24 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -13057,7 +13138,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB19_4: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_branch .LBB19_2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB19_2 +; GFX11-FAKE16-NEXT: s_branch .LBB19_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13255,6 +13338,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -13272,7 +13356,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -13286,10 +13370,13 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: s_cbranch_scc0 .LBB21_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB21_3 -; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB21_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB21_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 @@ -13320,16 +13407,15 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: .LBB21_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB21_4: -; SI-NEXT: s_branch .LBB21_2 ; ; VI-LABEL: bitcast_v30f32_to_v15i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -13347,7 +13433,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -13361,10 +13447,13 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: s_cbranch_scc0 .LBB21_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB21_3 -; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB21_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 @@ -13395,16 +13484,15 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: .LBB21_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB21_4: -; VI-NEXT: s_branch .LBB21_2 ; ; GFX9-LABEL: bitcast_v30f32_to_v15i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -13422,7 +13510,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -13436,10 +13524,13 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB21_3 -; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 @@ -13470,43 +13561,41 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: .LBB21_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB21_4: -; GFX9-NEXT: s_branch .LBB21_2 ; ; GFX11-LABEL: bitcast_v30f32_to_v15i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB21_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB21_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB21_3: -; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -13522,6 +13611,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB21_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13743,6 +13833,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -13760,7 +13851,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -13774,10 +13865,13 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: s_cbranch_scc0 .LBB23_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB23_3 -; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB23_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB23_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 @@ -13808,16 +13902,15 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: .LBB23_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB23_4: -; SI-NEXT: s_branch .LBB23_2 ; ; VI-LABEL: bitcast_v15i64_to_v30f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -13835,7 +13928,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -13849,10 +13942,13 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: s_cbranch_scc0 .LBB23_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB23_3 -; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB23_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 ; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 @@ -13883,16 +13979,15 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: .LBB23_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB23_4: -; VI-NEXT: s_branch .LBB23_2 ; ; GFX9-LABEL: bitcast_v15i64_to_v30f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -13910,7 +14005,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -13924,10 +14019,13 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB23_3 -; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc ; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 @@ -13958,43 +14056,41 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: .LBB23_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB23_4: -; GFX9-NEXT: s_branch .LBB23_2 ; ; GFX11-LABEL: bitcast_v15i64_to_v30f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB23_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB23_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB23_3: -; GFX11-NEXT: .LBB23_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo @@ -14033,6 +14129,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB23_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -14231,6 +14328,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -14248,7 +14346,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -14262,10 +14360,13 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: s_cbranch_scc0 .LBB25_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB25_3 -; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB25_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB25_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 @@ -14296,16 +14397,15 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: .LBB25_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB25_4: -; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_v30f32_to_v15f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -14323,7 +14423,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -14337,10 +14437,13 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: s_cbranch_scc0 .LBB25_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB25_3 -; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB25_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 @@ -14371,16 +14474,15 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: .LBB25_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB25_4: -; VI-NEXT: s_branch .LBB25_2 ; ; GFX9-LABEL: bitcast_v30f32_to_v15f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -14398,7 +14500,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -14412,10 +14514,13 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB25_3 -; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 @@ -14446,43 +14551,41 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: .LBB25_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB25_4: -; GFX9-NEXT: s_branch .LBB25_2 ; ; GFX11-LABEL: bitcast_v30f32_to_v15f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB25_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB25_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB25_3: -; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -14498,6 +14601,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg % ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB25_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -14651,6 +14755,7 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -14679,13 +14784,16 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB27_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB27_3 -; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB27_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB27_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -14701,17 +14809,16 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: .LBB27_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: v_mov_b32_e32 v17, v31 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB27_4: -; SI-NEXT: s_branch .LBB27_2 ; ; VI-LABEL: bitcast_v15f64_to_v30f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -14740,13 +14847,16 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB27_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB27_3 -; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB27_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -14762,17 +14872,16 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: .LBB27_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: v_mov_b32_e32 v17, v31 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB27_4: -; VI-NEXT: s_branch .LBB27_2 ; ; GFX9-LABEL: bitcast_v15f64_to_v30f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -14801,13 +14910,16 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB27_3 -; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -14823,44 +14935,42 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: .LBB27_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: v_mov_b32_e32 v17, v31 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB27_4: -; GFX9-NEXT: s_branch .LBB27_2 ; ; GFX11-LABEL: bitcast_v15f64_to_v30f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB27_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB27_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB27_3: -; GFX11-NEXT: .LBB27_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -14876,6 +14986,7 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB27_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -15867,12 +15978,13 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v29, s16 ; SI-NEXT: v_mov_b32_e32 v28, s17 ; SI-NEXT: v_mov_b32_e32 v33, s18 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_mov_b32_e32 v32, s19 -; SI-NEXT: v_mov_b32_e32 v29, s20 +; SI-NEXT: v_mov_b32_e32 v30, s20 ; SI-NEXT: v_mov_b32_e32 v27, s21 ; SI-NEXT: v_mov_b32_e32 v25, s22 ; SI-NEXT: v_mov_b32_e32 v24, s23 @@ -15909,11 +16021,11 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16 ; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16 ; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16 -; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16 +; SI-NEXT: v_alignbit_b32 v44, v27, v30, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_alignbit_b32 v46, v32, v33, 16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16 +; SI-NEXT: v_alignbit_b32 v56, v28, v29, 16 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 @@ -15936,11 +16048,11 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 @@ -15977,11 +16089,11 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16 ; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16 ; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16 -; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16 +; SI-NEXT: v_alignbit_b32 v44, v27, v30, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_alignbit_b32 v46, v32, v33, 16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16 +; SI-NEXT: v_alignbit_b32 v56, v28, v29, 16 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 @@ -16002,31 +16114,31 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_or_b32_e32 v30, v30, v56 -; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v29, v29, v56 +; SI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v60 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v60 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v46 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v46 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v30 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44 ; SI-NEXT: v_or_b32_e32 v28, v28, v29 ; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 @@ -16227,19 +16339,22 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB29_2 +; SI-NEXT: s_branch .LBB29_3 ; ; VI-LABEL: bitcast_v30f32_to_v60i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v19, s16 ; VI-NEXT: v_mov_b32_e32 v18, s17 ; VI-NEXT: v_mov_b32_e32 v17, s18 -; VI-NEXT: v_mov_b32_e32 v28, s19 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v27, s19 +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v29, s20 -; VI-NEXT: v_mov_b32_e32 v27, s21 +; VI-NEXT: v_mov_b32_e32 v28, s21 ; VI-NEXT: v_mov_b32_e32 v26, s22 ; VI-NEXT: v_mov_b32_e32 v25, s23 ; VI-NEXT: v_mov_b32_e32 v24, s24 @@ -16286,9 +16401,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 @@ -16318,9 +16433,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 @@ -16348,9 +16463,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 @@ -16362,11 +16477,11 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 ; VI-NEXT: v_or_b32_sdwa v34, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 ; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 ; VI-NEXT: v_or_b32_sdwa v38, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 @@ -16476,19 +16591,22 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB29_2 +; VI-NEXT: s_branch .LBB29_3 ; ; GFX9-LABEL: bitcast_v30f32_to_v60i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v19, s16 ; GFX9-NEXT: v_mov_b32_e32 v18, s17 ; GFX9-NEXT: v_mov_b32_e32 v17, s18 -; GFX9-NEXT: v_mov_b32_e32 v28, s19 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v29, s20 -; GFX9-NEXT: v_mov_b32_e32 v27, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s21 ; GFX9-NEXT: v_mov_b32_e32 v26, s22 ; GFX9-NEXT: v_mov_b32_e32 v25, s23 ; GFX9-NEXT: v_mov_b32_e32 v24, s24 @@ -16535,9 +16653,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 @@ -16567,9 +16685,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 @@ -16597,20 +16715,20 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; GFX9-NEXT: .LBB29_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 @@ -16725,7 +16843,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB29_2 +; GFX9-NEXT: s_branch .LBB29_3 ; ; GFX11-LABEL: bitcast_v30f32_to_v60i16_scalar: ; GFX11: ; %bb.0: @@ -16738,10 +16858,10 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21 ; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23 ; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26 -; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v17, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v15, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11 @@ -16756,9 +16876,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v17 ; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18 @@ -16774,8 +16894,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28 ; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29 ; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-NEXT: s_cbranch_execnz .LBB29_3 ; GFX11-NEXT: .LBB29_2: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 ; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 @@ -16783,8 +16902,8 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 @@ -16804,9 +16923,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v17 ; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18 @@ -16837,7 +16956,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v19 ; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v15 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 @@ -16853,7 +16972,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX11-NEXT: v_lshl_or_b32 v13, v82, 16, v14 ; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v21 -; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v17 +; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v17 ; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v18 ; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2 @@ -16865,7 +16984,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28 ; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22 ; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0 ; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2 ; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3 @@ -16881,7 +17000,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24 ; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11 -; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15 +; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v16 ; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0 ; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2 ; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3 @@ -16924,7 +17043,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: ; implicit-def: $vgpr52 ; GFX11-NEXT: ; implicit-def: $vgpr51 ; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB29_2 +; GFX11-NEXT: s_branch .LBB29_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -18342,6 +18463,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v35, v22 ; SI-NEXT: v_mov_b32_e32 v36, v20 ; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 @@ -18373,7 +18495,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 @@ -18676,7 +18798,9 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: s_branch .LBB31_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB31_2 +; SI-NEXT: s_branch .LBB31_3 ; ; VI-LABEL: bitcast_v60i16_to_v30f32_scalar: ; VI: ; %bb.0: @@ -18696,6 +18820,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_lshr_b32 s42, s17, 16 ; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v32, v15 ; VI-NEXT: v_mov_b32_e32 v33, v14 ; VI-NEXT: v_mov_b32_e32 v34, v13 @@ -18712,7 +18837,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v53, v2 ; VI-NEXT: v_mov_b32_e32 v54, v1 ; VI-NEXT: v_mov_b32_e32 v55, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: s_cbranch_scc0 .LBB31_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 @@ -18959,11 +19084,28 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB31_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB31_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB31_2 +; VI-NEXT: s_branch .LBB31_3 ; ; GFX9-LABEL: bitcast_v60i16_to_v30f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 ; GFX9-NEXT: v_mov_b32_e32 v32, v15 ; GFX9-NEXT: v_mov_b32_e32 v33, v14 ; GFX9-NEXT: v_mov_b32_e32 v34, v13 @@ -18980,21 +19122,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v53, v2 ; GFX9-NEXT: v_mov_b32_e32 v54, v1 ; GFX9-NEXT: v_mov_b32_e32 v55, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 -; GFX9-NEXT: s_lshr_b32 s41, s28, 16 -; GFX9-NEXT: s_lshr_b32 s42, s27, 16 -; GFX9-NEXT: s_lshr_b32 s43, s26, 16 -; GFX9-NEXT: s_lshr_b32 s15, s25, 16 -; GFX9-NEXT: s_lshr_b32 s14, s24, 16 -; GFX9-NEXT: s_lshr_b32 s13, s23, 16 -; GFX9-NEXT: s_lshr_b32 s12, s22, 16 -; GFX9-NEXT: s_lshr_b32 s11, s21, 16 -; GFX9-NEXT: s_lshr_b32 s10, s20, 16 -; GFX9-NEXT: s_lshr_b32 s9, s19, 16 -; GFX9-NEXT: s_lshr_b32 s8, s18, 16 -; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -19015,7 +19143,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 @@ -19030,6 +19157,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 @@ -19175,7 +19303,9 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB31_4: ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB31_2 +; GFX9-NEXT: s_branch .LBB31_3 ; ; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -19220,41 +19350,41 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -19269,17 +19399,16 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_3 ; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -19293,24 +19422,24 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -19327,7 +19456,9 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB31_4: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB31_2 +; GFX11-TRUE16-NEXT: s_branch .LBB31_3 ; ; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32_scalar: ; GFX11-FAKE16: ; %bb.0: @@ -19360,41 +19491,41 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -19409,17 +19540,16 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_3 ; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -19433,24 +19563,24 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -19467,7 +19597,9 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB31_4: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_branch .LBB31_2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB31_2 +; GFX11-FAKE16-NEXT: s_branch .LBB31_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -20815,6 +20947,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s45, v1 ; SI-NEXT: v_readfirstlane_b32 s44, v2 ; SI-NEXT: v_readfirstlane_b32 s43, v3 @@ -20827,11 +20960,11 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_readfirstlane_b32 s12, v10 ; SI-NEXT: v_readfirstlane_b32 s11, v11 ; SI-NEXT: v_readfirstlane_b32 s10, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: v_readfirstlane_b32 s7, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v15 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -20850,14 +20983,14 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: s_lshr_b32 s4, s6, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 ; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 ; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 @@ -20912,10 +21045,10 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v61, s11 @@ -20969,7 +21102,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16 -; SI-NEXT: v_add_f32_e64 v40, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v40, s7, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v40 @@ -20983,7 +21116,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v19, s26, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: v_add_f32_e64 v48, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v48, s9, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v48 @@ -21032,8 +21165,8 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 ; SI-NEXT: v_add_f32_e64 v20, s14, 1.0 ; SI-NEXT: v_add_f32_e64 v33, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v52, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v44, s9, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v44, s6, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 @@ -21375,19 +21508,22 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB33_2 +; SI-NEXT: s_branch .LBB33_3 ; ; VI-LABEL: bitcast_v30f32_to_v60f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v19, s16 ; VI-NEXT: v_mov_b32_e32 v18, s17 ; VI-NEXT: v_mov_b32_e32 v17, s18 -; VI-NEXT: v_mov_b32_e32 v28, s19 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v27, s19 +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v29, s20 -; VI-NEXT: v_mov_b32_e32 v27, s21 +; VI-NEXT: v_mov_b32_e32 v28, s21 ; VI-NEXT: v_mov_b32_e32 v26, s22 ; VI-NEXT: v_mov_b32_e32 v25, s23 ; VI-NEXT: v_mov_b32_e32 v24, s24 @@ -21434,9 +21570,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 @@ -21466,9 +21602,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 @@ -21496,9 +21632,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 @@ -21510,11 +21646,11 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 ; VI-NEXT: v_or_b32_sdwa v34, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 ; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 ; VI-NEXT: v_or_b32_sdwa v38, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 @@ -21624,19 +21760,22 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB33_2 +; VI-NEXT: s_branch .LBB33_3 ; ; GFX9-LABEL: bitcast_v30f32_to_v60f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v19, s16 ; GFX9-NEXT: v_mov_b32_e32 v18, s17 ; GFX9-NEXT: v_mov_b32_e32 v17, s18 -; GFX9-NEXT: v_mov_b32_e32 v28, s19 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v29, s20 -; GFX9-NEXT: v_mov_b32_e32 v27, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s21 ; GFX9-NEXT: v_mov_b32_e32 v26, s22 ; GFX9-NEXT: v_mov_b32_e32 v25, s23 ; GFX9-NEXT: v_mov_b32_e32 v24, s24 @@ -21683,9 +21822,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 @@ -21715,9 +21854,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 @@ -21745,20 +21884,20 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; GFX9-NEXT: .LBB33_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 @@ -21873,7 +22012,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB33_2 +; GFX9-NEXT: s_branch .LBB33_3 ; ; GFX11-LABEL: bitcast_v30f32_to_v60f16_scalar: ; GFX11: ; %bb.0: @@ -21886,10 +22027,10 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21 ; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23 ; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26 -; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v17, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v15, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB33_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11 @@ -21904,9 +22045,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v17 ; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18 @@ -21922,8 +22063,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28 ; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29 ; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-NEXT: s_cbranch_execnz .LBB33_3 ; GFX11-NEXT: .LBB33_2: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 ; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 @@ -21931,8 +22071,8 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 @@ -21952,9 +22092,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v17 ; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18 @@ -21985,7 +22125,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v19 ; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v15 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 @@ -22001,7 +22141,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX11-NEXT: v_lshl_or_b32 v13, v82, 16, v14 ; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v21 -; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v17 +; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v17 ; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v18 ; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2 @@ -22013,7 +22153,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28 ; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22 ; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0 ; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2 ; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3 @@ -22029,7 +22169,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24 ; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11 -; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15 +; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v16 ; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0 ; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2 ; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3 @@ -22072,7 +22212,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; GFX11-NEXT: ; implicit-def: $vgpr52 ; GFX11-NEXT: ; implicit-def: $vgpr51 ; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB33_2 +; GFX11-NEXT: s_branch .LBB33_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -23683,11 +23825,11 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 @@ -23701,83 +23843,92 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23786,260 +23937,240 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: v_or_b32_e32 v8, v49, v8 +; SI-NEXT: v_mov_b32_e32 v57, v48 +; SI-NEXT: v_mov_b32_e32 v56, v36 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: v_mov_b32_e32 v46, v37 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v45, v35 ; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v44, v34 ; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: v_or_b32_e32 v14, v55, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v16, v53, v16 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 +; SI-NEXT: v_mov_b32_e32 v35, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v54, v29 -; SI-NEXT: v_mov_b32_e32 v54, v32 ; SI-NEXT: s_branch .LBB35_3 ; SI-NEXT: .LBB35_2: -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: v_mov_b32_e32 v52, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v45, v35 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: v_mov_b32_e32 v57, v48 +; SI-NEXT: v_mov_b32_e32 v56, v36 +; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: v_mov_b32_e32 v46, v37 ; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v44, v34 ; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v43, v62 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 ; SI-NEXT: .LBB35_3: ; %Flow ; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB35_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 ; SI-NEXT: v_mov_b32_e32 v55, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -24047,42 +24178,48 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -24095,65 +24232,65 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -24165,14 +24302,14 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -24180,9 +24317,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -24192,7 +24329,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -24204,32 +24341,39 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -24314,6 +24458,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; VI-NEXT: s_lshr_b32 s42, s17, 16 ; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v32, v15 ; VI-NEXT: v_mov_b32_e32 v33, v14 ; VI-NEXT: v_mov_b32_e32 v34, v13 @@ -24330,7 +24475,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; VI-NEXT: v_mov_b32_e32 v53, v2 ; VI-NEXT: v_mov_b32_e32 v54, v1 ; VI-NEXT: v_mov_b32_e32 v55, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: s_cbranch_scc0 .LBB35_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 @@ -24534,11 +24679,28 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB35_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB35_2 +; VI-NEXT: s_branch .LBB35_3 ; ; GFX9-LABEL: bitcast_v60f16_to_v30f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 ; GFX9-NEXT: v_mov_b32_e32 v32, v15 ; GFX9-NEXT: v_mov_b32_e32 v33, v14 ; GFX9-NEXT: v_mov_b32_e32 v34, v13 @@ -24555,21 +24717,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v53, v2 ; GFX9-NEXT: v_mov_b32_e32 v54, v1 ; GFX9-NEXT: v_mov_b32_e32 v55, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 -; GFX9-NEXT: s_lshr_b32 s41, s28, 16 -; GFX9-NEXT: s_lshr_b32 s42, s27, 16 -; GFX9-NEXT: s_lshr_b32 s43, s26, 16 -; GFX9-NEXT: s_lshr_b32 s15, s25, 16 -; GFX9-NEXT: s_lshr_b32 s14, s24, 16 -; GFX9-NEXT: s_lshr_b32 s13, s23, 16 -; GFX9-NEXT: s_lshr_b32 s12, s22, 16 -; GFX9-NEXT: s_lshr_b32 s11, s21, 16 -; GFX9-NEXT: s_lshr_b32 s10, s20, 16 -; GFX9-NEXT: s_lshr_b32 s9, s19, 16 -; GFX9-NEXT: s_lshr_b32 s8, s18, 16 -; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -24590,7 +24738,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 @@ -24605,6 +24752,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 @@ -24752,7 +24900,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB35_4: ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB35_2 +; GFX9-NEXT: s_branch .LBB35_3 ; ; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -24797,41 +24947,41 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -24846,17 +24996,16 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3 ; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -24870,24 +25019,24 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -24904,7 +25053,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB35_4: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB35_2 +; GFX11-TRUE16-NEXT: s_branch .LBB35_3 ; ; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32_scalar: ; GFX11-FAKE16: ; %bb.0: @@ -24937,41 +25088,41 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -24986,17 +25137,16 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3 ; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -25010,24 +25160,24 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -25044,7 +25194,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB35_4: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_branch .LBB35_2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB35_2 +; GFX11-FAKE16-NEXT: s_branch .LBB35_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -25265,6 +25417,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -25282,7 +25435,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -25296,10 +25449,13 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; SI-NEXT: v_mov_b32_e32 v11, s27 ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: s_cbranch_scc0 .LBB37_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB37_3 -; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB37_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB37_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 @@ -25330,16 +25486,15 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: .LBB37_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB37_4: -; SI-NEXT: s_branch .LBB37_2 ; ; VI-LABEL: bitcast_v15i64_to_v15f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -25357,7 +25512,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -25371,10 +25526,13 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: s_cbranch_scc0 .LBB37_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB37_3 -; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB37_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB37_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -25405,16 +25563,15 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 ; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: .LBB37_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB37_4: -; VI-NEXT: s_branch .LBB37_2 ; ; GFX9-LABEL: bitcast_v15i64_to_v15f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -25432,7 +25589,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -25446,10 +25603,13 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB37_3 -; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB37_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 @@ -25480,43 +25640,41 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc ; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: .LBB37_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB37_4: -; GFX9-NEXT: s_branch .LBB37_2 ; ; GFX11-LABEL: bitcast_v15i64_to_v15f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB37_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB37_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB37_3: -; GFX11-NEXT: .LBB37_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -25555,6 +25713,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, ; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: .LBB37_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -25708,6 +25867,7 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v29, v15 ; SI-NEXT: v_mov_b32_e32 v28, v14 ; SI-NEXT: v_mov_b32_e32 v27, v13 @@ -25736,13 +25896,16 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: s_cbranch_scc0 .LBB39_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB39_3 -; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: .LBB39_2: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB39_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 @@ -25758,17 +25921,16 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: .LBB39_4: ; %end ; SI-NEXT: v_mov_b32_e32 v16, v30 ; SI-NEXT: v_mov_b32_e32 v17, v31 ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB39_4: -; SI-NEXT: s_branch .LBB39_2 ; ; VI-LABEL: bitcast_v15f64_to_v15i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 ; VI-NEXT: v_mov_b32_e32 v27, v13 @@ -25797,13 +25959,16 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: s_cbranch_scc0 .LBB39_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB39_3 -; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB39_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB39_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 @@ -25819,17 +25984,16 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: .LBB39_4: ; %end ; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: v_mov_b32_e32 v17, v31 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB39_4: -; VI-NEXT: s_branch .LBB39_2 ; ; GFX9-LABEL: bitcast_v15f64_to_v15i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v29, v15 ; GFX9-NEXT: v_mov_b32_e32 v28, v14 ; GFX9-NEXT: v_mov_b32_e32 v27, v13 @@ -25858,13 +26022,16 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 -; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB39_3 -; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB39_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 @@ -25880,44 +26047,42 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: .LBB39_4: ; %end ; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: v_mov_b32_e32 v17, v31 ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB39_4: -; GFX9-NEXT: s_branch .LBB39_2 ; ; GFX11-LABEL: bitcast_v15f64_to_v15i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 -; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 -; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 -; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 -; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 -; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 -; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 -; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 -; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16 +; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18 +; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20 +; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22 +; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24 +; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28 +; GFX11-NEXT: v_mov_b32_e32 v17, s29 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo -; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 -; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: .LBB39_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccz .LBB39_4 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_setpc_b64 s[30:31] -; GFX11-NEXT: .LBB39_3: -; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 @@ -25933,6 +26098,7 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: .LBB39_4: ; %end ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -26970,6 +27136,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s45, v1 ; SI-NEXT: v_readfirstlane_b32 s44, v2 ; SI-NEXT: v_readfirstlane_b32 s43, v3 @@ -26985,8 +27152,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s9, v13 ; SI-NEXT: v_readfirstlane_b32 s8, v14 ; SI-NEXT: v_readfirstlane_b32 s7, v15 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_mov_b32_e32 v1, s7 @@ -27324,7 +27491,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB41_2 +; SI-NEXT: s_branch .LBB41_3 ; ; VI-LABEL: bitcast_v15i64_to_v60i16_scalar: ; VI: ; %bb.0: @@ -27337,8 +27506,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: v_writelane_b32 v30, s34, 2 ; VI-NEXT: v_writelane_b32 v30, s35, 3 ; VI-NEXT: v_writelane_b32 v30, s36, 4 -; VI-NEXT: v_writelane_b32 v30, s37, 5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_writelane_b32 v30, s38, 6 ; VI-NEXT: v_readfirstlane_b32 s45, v0 ; VI-NEXT: v_readfirstlane_b32 s44, v1 @@ -27354,14 +27524,14 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s10, v11 ; VI-NEXT: v_readfirstlane_b32 s9, v12 ; VI-NEXT: v_readfirstlane_b32 s8, v13 -; VI-NEXT: v_readfirstlane_b32 s6, v14 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_writelane_b32 v30, s39, 7 ; VI-NEXT: s_cbranch_scc0 .LBB41_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_lshr_b32 s47, s7, 16 ; VI-NEXT: s_lshr_b32 s56, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 @@ -27392,8 +27562,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s39, s16, 16 ; VI-NEXT: s_cbranch_execnz .LBB41_3 ; VI-NEXT: .LBB41_2: ; %cmp.true -; VI-NEXT: s_add_u32 s6, s6, 3 -; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s7, s7, 3 +; VI-NEXT: s_addc_u32 s6, s6, 0 ; VI-NEXT: s_add_u32 s9, s9, 3 ; VI-NEXT: s_addc_u32 s8, s8, 0 ; VI-NEXT: s_add_u32 s11, s11, 3 @@ -27422,8 +27592,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: s_addc_u32 s19, s19, 0 ; VI-NEXT: s_add_u32 s16, s16, 3 ; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_lshr_b32 s47, s7, 16 ; VI-NEXT: s_lshr_b32 s56, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 @@ -27537,12 +27707,12 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: s_and_b32 s8, 0xffff, s8 ; VI-NEXT: s_lshl_b32 s44, s56, 16 ; VI-NEXT: s_or_b32 s8, s8, s44 -; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s44, s47, 16 -; VI-NEXT: s_or_b32 s6, s6, s44 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_lshl_b32 s44, s47, 16 ; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s16 @@ -27571,8 +27741,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v25, s10 ; VI-NEXT: v_mov_b32_e32 v26, s9 ; VI-NEXT: v_mov_b32_e32 v27, s8 -; VI-NEXT: v_mov_b32_e32 v28, s6 -; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_mov_b32_e32 v28, s7 +; VI-NEXT: v_mov_b32_e32 v29, s6 ; VI-NEXT: v_readlane_b32 s39, v30, 7 ; VI-NEXT: v_readlane_b32 s38, v30, 6 ; VI-NEXT: v_readlane_b32 s37, v30, 5 @@ -27617,7 +27787,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB41_2 +; VI-NEXT: s_branch .LBB41_3 ; ; GFX9-LABEL: bitcast_v15i64_to_v60i16_scalar: ; GFX9: ; %bb.0: @@ -27626,45 +27798,46 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v30, s30, 0 -; GFX9-NEXT: v_writelane_b32 v30, s31, 1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_writelane_b32 v30, s34, 2 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 -; GFX9-NEXT: v_readfirstlane_b32 s40, v10 -; GFX9-NEXT: v_readfirstlane_b32 s41, v11 -; GFX9-NEXT: v_readfirstlane_b32 s42, v12 -; GFX9-NEXT: v_readfirstlane_b32 s43, v13 -; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s9, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v3 +; GFX9-NEXT: v_readfirstlane_b32 s11, v4 +; GFX9-NEXT: v_readfirstlane_b32 s12, v5 +; GFX9-NEXT: v_readfirstlane_b32 s13, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s40, v9 +; GFX9-NEXT: v_readfirstlane_b32 s41, v10 +; GFX9-NEXT: v_readfirstlane_b32 s42, v11 +; GFX9-NEXT: v_readfirstlane_b32 s43, v12 +; GFX9-NEXT: v_readfirstlane_b32 s44, v13 +; GFX9-NEXT: v_readfirstlane_b32 s45, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_writelane_b32 v30, s35, 3 ; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: s_lshr_b32 s47, s44, 16 -; GFX9-NEXT: s_lshr_b32 s56, s43, 16 -; GFX9-NEXT: s_lshr_b32 s57, s42, 16 -; GFX9-NEXT: s_lshr_b32 s58, s41, 16 -; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: s_lshr_b32 s47, s45, 16 +; GFX9-NEXT: s_lshr_b32 s56, s44, 16 +; GFX9-NEXT: s_lshr_b32 s57, s43, 16 +; GFX9-NEXT: s_lshr_b32 s58, s42, 16 +; GFX9-NEXT: s_lshr_b32 s59, s41, 16 +; GFX9-NEXT: s_lshr_b32 s60, s40, 16 +; GFX9-NEXT: s_lshr_b32 s61, s15, 16 +; GFX9-NEXT: s_lshr_b32 s62, s14, 16 +; GFX9-NEXT: s_lshr_b32 s63, s13, 16 +; GFX9-NEXT: s_lshr_b32 s72, s12, 16 +; GFX9-NEXT: s_lshr_b32 s73, s11, 16 +; GFX9-NEXT: s_lshr_b32 s74, s10, 16 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s77, s7, 16 ; GFX9-NEXT: s_lshr_b32 s78, s29, 16 ; GFX9-NEXT: s_lshr_b32 s79, s28, 16 ; GFX9-NEXT: s_lshr_b32 s88, s27, 16 @@ -27681,22 +27854,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s35, s16, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB41_3 ; GFX9-NEXT: .LBB41_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s44, s44, 3 -; GFX9-NEXT: s_addc_u32 s45, s45, 0 -; GFX9-NEXT: s_add_u32 s42, s42, 3 -; GFX9-NEXT: s_addc_u32 s43, s43, 0 -; GFX9-NEXT: s_add_u32 s40, s40, 3 -; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_add_u32 s14, s14, 3 -; GFX9-NEXT: s_addc_u32 s15, s15, 0 -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s45, s45, 3 +; GFX9-NEXT: s_addc_u32 s6, s6, 0 +; GFX9-NEXT: s_add_u32 s43, s43, 3 +; GFX9-NEXT: s_addc_u32 s44, s44, 0 +; GFX9-NEXT: s_add_u32 s41, s41, 3 +; GFX9-NEXT: s_addc_u32 s42, s42, 0 +; GFX9-NEXT: s_add_u32 s15, s15, 3 +; GFX9-NEXT: s_addc_u32 s40, s40, 0 +; GFX9-NEXT: s_add_u32 s13, s13, 3 +; GFX9-NEXT: s_addc_u32 s14, s14, 0 +; GFX9-NEXT: s_add_u32 s11, s11, 3 +; GFX9-NEXT: s_addc_u32 s12, s12, 0 +; GFX9-NEXT: s_add_u32 s9, s9, 3 +; GFX9-NEXT: s_addc_u32 s10, s10, 0 +; GFX9-NEXT: s_add_u32 s7, s7, 3 +; GFX9-NEXT: s_addc_u32 s8, s8, 0 ; GFX9-NEXT: s_add_u32 s28, s28, 3 ; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 @@ -27711,22 +27884,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: s_lshr_b32 s47, s44, 16 -; GFX9-NEXT: s_lshr_b32 s56, s43, 16 -; GFX9-NEXT: s_lshr_b32 s57, s42, 16 -; GFX9-NEXT: s_lshr_b32 s58, s41, 16 -; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: s_lshr_b32 s47, s45, 16 +; GFX9-NEXT: s_lshr_b32 s56, s44, 16 +; GFX9-NEXT: s_lshr_b32 s57, s43, 16 +; GFX9-NEXT: s_lshr_b32 s58, s42, 16 +; GFX9-NEXT: s_lshr_b32 s59, s41, 16 +; GFX9-NEXT: s_lshr_b32 s60, s40, 16 +; GFX9-NEXT: s_lshr_b32 s61, s15, 16 +; GFX9-NEXT: s_lshr_b32 s62, s14, 16 +; GFX9-NEXT: s_lshr_b32 s63, s13, 16 +; GFX9-NEXT: s_lshr_b32 s72, s12, 16 +; GFX9-NEXT: s_lshr_b32 s73, s11, 16 +; GFX9-NEXT: s_lshr_b32 s74, s10, 16 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s77, s7, 16 ; GFX9-NEXT: s_lshr_b32 s78, s29, 16 ; GFX9-NEXT: s_lshr_b32 s79, s28, 16 ; GFX9-NEXT: s_lshr_b32 s88, s27, 16 @@ -27756,22 +27929,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 ; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 ; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s46 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 @@ -27786,22 +27959,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v11, s25 ; GFX9-NEXT: v_mov_b32_e32 v12, s26 ; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 -; GFX9-NEXT: v_mov_b32_e32 v24, s28 -; GFX9-NEXT: v_mov_b32_e32 v25, s29 -; GFX9-NEXT: v_mov_b32_e32 v26, s40 -; GFX9-NEXT: v_mov_b32_e32 v27, s41 -; GFX9-NEXT: v_mov_b32_e32 v28, s42 -; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_mov_b32_e32 v19, s12 +; GFX9-NEXT: v_mov_b32_e32 v20, s13 +; GFX9-NEXT: v_mov_b32_e32 v21, s14 +; GFX9-NEXT: v_mov_b32_e32 v22, s15 +; GFX9-NEXT: v_mov_b32_e32 v23, s28 +; GFX9-NEXT: v_mov_b32_e32 v24, s29 +; GFX9-NEXT: v_mov_b32_e32 v25, s40 +; GFX9-NEXT: v_mov_b32_e32 v26, s41 +; GFX9-NEXT: v_mov_b32_e32 v27, s42 +; GFX9-NEXT: v_mov_b32_e32 v28, s43 +; GFX9-NEXT: v_mov_b32_e32 v29, s6 ; GFX9-NEXT: v_readlane_b32 s35, v30, 3 ; GFX9-NEXT: v_readlane_b32 s34, v30, 2 ; GFX9-NEXT: v_readlane_b32 s31, v30, 1 @@ -27842,7 +28015,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: ; implicit-def: $sgpr47 ; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB41_2 +; GFX9-NEXT: s_branch .LBB41_3 ; ; GFX11-LABEL: bitcast_v15i64_to_v60i16_scalar: ; GFX11: ; %bb.0: @@ -27857,16 +28032,16 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX11-NEXT: v_readfirstlane_b32 s10, v6 ; GFX11-NEXT: v_readfirstlane_b32 s11, v7 ; GFX11-NEXT: v_readfirstlane_b32 s12, v8 -; GFX11-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-NEXT: v_readfirstlane_b32 s14, v9 ; GFX11-NEXT: v_readfirstlane_b32 s15, v10 -; GFX11-NEXT: v_readfirstlane_b32 s14, v11 -; GFX11-NEXT: s_mov_b32 s94, 0 +; GFX11-NEXT: v_readfirstlane_b32 s13, v11 +; GFX11-NEXT: s_mov_b32 s94, -1 ; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 ; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_lshr_b32 s43, s12, 16 ; GFX11-NEXT: s_lshr_b32 s44, s11, 16 ; GFX11-NEXT: s_lshr_b32 s45, s10, 16 @@ -27894,13 +28069,12 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 ; GFX11-NEXT: s_lshr_b32 s93, s0, 16 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 -; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-NEXT: s_cbranch_execnz .LBB41_3 ; GFX11-NEXT: .LBB41_2: ; %cmp.true ; GFX11-NEXT: s_add_u32 s15, s15, 3 -; GFX11-NEXT: s_addc_u32 s14, s14, 0 -; GFX11-NEXT: s_add_u32 s12, s12, 3 ; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s14, s14, 0 ; GFX11-NEXT: s_add_u32 s10, s10, 3 ; GFX11-NEXT: s_addc_u32 s11, s11, 0 ; GFX11-NEXT: s_add_u32 s8, s8, 3 @@ -27927,9 +28101,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 ; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_lshr_b32 s43, s12, 16 ; GFX11-NEXT: s_lshr_b32 s44, s11, 16 ; GFX11-NEXT: s_lshr_b32 s45, s10, 16 @@ -27986,9 +28160,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45 ; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44 ; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s42 ; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 @@ -28002,8 +28176,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 ; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 ; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 -; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s14 +; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s13 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB41_4: ; GFX11-NEXT: ; implicit-def: $sgpr93 @@ -28036,7 +28210,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-NEXT: s_cbranch_vccz .LBB41_2 +; GFX11-NEXT: s_branch .LBB41_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -29454,6 +29630,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v35, v22 ; SI-NEXT: v_mov_b32_e32 v36, v20 ; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 @@ -29485,7 +29662,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 @@ -29788,7 +29965,9 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: s_branch .LBB43_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB43_2 +; SI-NEXT: s_branch .LBB43_3 ; ; VI-LABEL: bitcast_v60i16_to_v15i64_scalar: ; VI: ; %bb.0: @@ -29808,6 +29987,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_lshr_b32 s42, s17, 16 ; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v32, v15 ; VI-NEXT: v_mov_b32_e32 v33, v14 ; VI-NEXT: v_mov_b32_e32 v34, v13 @@ -29824,7 +30004,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v53, v2 ; VI-NEXT: v_mov_b32_e32 v54, v1 ; VI-NEXT: v_mov_b32_e32 v55, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: s_cbranch_scc0 .LBB43_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 @@ -30071,11 +30251,28 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB43_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB43_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB43_2 +; VI-NEXT: s_branch .LBB43_3 ; ; GFX9-LABEL: bitcast_v60i16_to_v15i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 ; GFX9-NEXT: v_mov_b32_e32 v32, v15 ; GFX9-NEXT: v_mov_b32_e32 v33, v14 ; GFX9-NEXT: v_mov_b32_e32 v34, v13 @@ -30092,21 +30289,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v53, v2 ; GFX9-NEXT: v_mov_b32_e32 v54, v1 ; GFX9-NEXT: v_mov_b32_e32 v55, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 -; GFX9-NEXT: s_lshr_b32 s41, s28, 16 -; GFX9-NEXT: s_lshr_b32 s42, s27, 16 -; GFX9-NEXT: s_lshr_b32 s43, s26, 16 -; GFX9-NEXT: s_lshr_b32 s15, s25, 16 -; GFX9-NEXT: s_lshr_b32 s14, s24, 16 -; GFX9-NEXT: s_lshr_b32 s13, s23, 16 -; GFX9-NEXT: s_lshr_b32 s12, s22, 16 -; GFX9-NEXT: s_lshr_b32 s11, s21, 16 -; GFX9-NEXT: s_lshr_b32 s10, s20, 16 -; GFX9-NEXT: s_lshr_b32 s9, s19, 16 -; GFX9-NEXT: s_lshr_b32 s8, s18, 16 -; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -30127,7 +30310,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 @@ -30142,6 +30324,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 @@ -30287,7 +30470,9 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB43_4: ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB43_2 +; GFX9-NEXT: s_branch .LBB43_3 ; ; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -30332,41 +30517,41 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -30381,17 +30566,16 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_3 ; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -30405,24 +30589,24 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -30439,7 +30623,9 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB43_4: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB43_2 +; GFX11-TRUE16-NEXT: s_branch .LBB43_3 ; ; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15i64_scalar: ; GFX11-FAKE16: ; %bb.0: @@ -30472,41 +30658,41 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -30521,17 +30707,16 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_3 ; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -30545,24 +30730,24 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -30579,7 +30764,9 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB43_4: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_branch .LBB43_2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB43_2 +; GFX11-FAKE16-NEXT: s_branch .LBB43_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -31974,6 +32161,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s44, v1 ; SI-NEXT: v_readfirstlane_b32 s45, v2 ; SI-NEXT: v_readfirstlane_b32 s42, v3 @@ -31986,11 +32174,11 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_readfirstlane_b32 s13, v10 ; SI-NEXT: v_readfirstlane_b32 s10, v11 ; SI-NEXT: v_readfirstlane_b32 s11, v12 -; SI-NEXT: v_readfirstlane_b32 s7, v13 -; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 ; SI-NEXT: v_readfirstlane_b32 s6, v15 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -32006,13 +32194,13 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: s_lshr_b32 s4, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: s_lshr_b32 s4, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: s_lshr_b32 s4, s8, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_lshr_b32 s4, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 @@ -32070,10 +32258,10 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s16, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 @@ -32154,18 +32342,18 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: s_addc_u32 s11, s11, 0 ; SI-NEXT: s_lshr_b32 s92, s10, 16 ; SI-NEXT: s_lshr_b32 s93, s11, 16 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_lshr_b32 s94, s7, 16 -; SI-NEXT: s_lshr_b32 s95, s8, 16 -; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_add_u32 s8, s8, 3 ; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s94, s8, 16 +; SI-NEXT: s_lshr_b32 s95, s9, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 ; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 -; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: s_lshr_b32 vcc_hi, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 @@ -32512,7 +32700,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_branch .LBB45_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB45_2 +; SI-NEXT: s_branch .LBB45_3 ; ; VI-LABEL: bitcast_v15i64_to_v60f16_scalar: ; VI: ; %bb.0: @@ -32525,8 +32715,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: v_writelane_b32 v30, s34, 2 ; VI-NEXT: v_writelane_b32 v30, s35, 3 ; VI-NEXT: v_writelane_b32 v30, s36, 4 -; VI-NEXT: v_writelane_b32 v30, s37, 5 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_writelane_b32 v30, s38, 6 ; VI-NEXT: v_readfirstlane_b32 s45, v0 ; VI-NEXT: v_readfirstlane_b32 s44, v1 @@ -32542,14 +32733,14 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: v_readfirstlane_b32 s10, v11 ; VI-NEXT: v_readfirstlane_b32 s9, v12 ; VI-NEXT: v_readfirstlane_b32 s8, v13 -; VI-NEXT: v_readfirstlane_b32 s6, v14 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_writelane_b32 v30, s39, 7 ; VI-NEXT: s_cbranch_scc0 .LBB45_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_lshr_b32 s47, s7, 16 ; VI-NEXT: s_lshr_b32 s56, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 @@ -32580,8 +32771,8 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: s_lshr_b32 s39, s16, 16 ; VI-NEXT: s_cbranch_execnz .LBB45_3 ; VI-NEXT: .LBB45_2: ; %cmp.true -; VI-NEXT: s_add_u32 s6, s6, 3 -; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s7, s7, 3 +; VI-NEXT: s_addc_u32 s6, s6, 0 ; VI-NEXT: s_add_u32 s9, s9, 3 ; VI-NEXT: s_addc_u32 s8, s8, 0 ; VI-NEXT: s_add_u32 s11, s11, 3 @@ -32610,8 +32801,8 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: s_addc_u32 s19, s19, 0 ; VI-NEXT: s_add_u32 s16, s16, 3 ; VI-NEXT: s_addc_u32 s17, s17, 0 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_lshr_b32 s47, s7, 16 ; VI-NEXT: s_lshr_b32 s56, s8, 16 ; VI-NEXT: s_lshr_b32 s57, s9, 16 ; VI-NEXT: s_lshr_b32 s58, s10, 16 @@ -32725,12 +32916,12 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: s_and_b32 s8, 0xffff, s8 ; VI-NEXT: s_lshl_b32 s44, s56, 16 ; VI-NEXT: s_or_b32 s8, s8, s44 -; VI-NEXT: s_and_b32 s6, 0xffff, s6 -; VI-NEXT: s_lshl_b32 s44, s47, 16 -; VI-NEXT: s_or_b32 s6, s6, s44 ; VI-NEXT: s_and_b32 s7, 0xffff, s7 -; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_lshl_b32 s44, s47, 16 ; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s16 @@ -32759,8 +32950,8 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v25, s10 ; VI-NEXT: v_mov_b32_e32 v26, s9 ; VI-NEXT: v_mov_b32_e32 v27, s8 -; VI-NEXT: v_mov_b32_e32 v28, s6 -; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_mov_b32_e32 v28, s7 +; VI-NEXT: v_mov_b32_e32 v29, s6 ; VI-NEXT: v_readlane_b32 s39, v30, 7 ; VI-NEXT: v_readlane_b32 s38, v30, 6 ; VI-NEXT: v_readlane_b32 s37, v30, 5 @@ -32805,7 +32996,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: s_branch .LBB45_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB45_2 +; VI-NEXT: s_branch .LBB45_3 ; ; GFX9-LABEL: bitcast_v15i64_to_v60f16_scalar: ; GFX9: ; %bb.0: @@ -32814,45 +33007,46 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v30, s30, 0 -; GFX9-NEXT: v_writelane_b32 v30, s31, 1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_writelane_b32 v30, s34, 2 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: v_readfirstlane_b32 s10, v4 -; GFX9-NEXT: v_readfirstlane_b32 s11, v5 -; GFX9-NEXT: v_readfirstlane_b32 s12, v6 -; GFX9-NEXT: v_readfirstlane_b32 s13, v7 -; GFX9-NEXT: v_readfirstlane_b32 s14, v8 -; GFX9-NEXT: v_readfirstlane_b32 s15, v9 -; GFX9-NEXT: v_readfirstlane_b32 s40, v10 -; GFX9-NEXT: v_readfirstlane_b32 s41, v11 -; GFX9-NEXT: v_readfirstlane_b32 s42, v12 -; GFX9-NEXT: v_readfirstlane_b32 s43, v13 -; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s9, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v3 +; GFX9-NEXT: v_readfirstlane_b32 s11, v4 +; GFX9-NEXT: v_readfirstlane_b32 s12, v5 +; GFX9-NEXT: v_readfirstlane_b32 s13, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s40, v9 +; GFX9-NEXT: v_readfirstlane_b32 s41, v10 +; GFX9-NEXT: v_readfirstlane_b32 s42, v11 +; GFX9-NEXT: v_readfirstlane_b32 s43, v12 +; GFX9-NEXT: v_readfirstlane_b32 s44, v13 +; GFX9-NEXT: v_readfirstlane_b32 s45, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_writelane_b32 v30, s35, 3 ; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: s_lshr_b32 s47, s44, 16 -; GFX9-NEXT: s_lshr_b32 s56, s43, 16 -; GFX9-NEXT: s_lshr_b32 s57, s42, 16 -; GFX9-NEXT: s_lshr_b32 s58, s41, 16 -; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: s_lshr_b32 s47, s45, 16 +; GFX9-NEXT: s_lshr_b32 s56, s44, 16 +; GFX9-NEXT: s_lshr_b32 s57, s43, 16 +; GFX9-NEXT: s_lshr_b32 s58, s42, 16 +; GFX9-NEXT: s_lshr_b32 s59, s41, 16 +; GFX9-NEXT: s_lshr_b32 s60, s40, 16 +; GFX9-NEXT: s_lshr_b32 s61, s15, 16 +; GFX9-NEXT: s_lshr_b32 s62, s14, 16 +; GFX9-NEXT: s_lshr_b32 s63, s13, 16 +; GFX9-NEXT: s_lshr_b32 s72, s12, 16 +; GFX9-NEXT: s_lshr_b32 s73, s11, 16 +; GFX9-NEXT: s_lshr_b32 s74, s10, 16 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s77, s7, 16 ; GFX9-NEXT: s_lshr_b32 s78, s29, 16 ; GFX9-NEXT: s_lshr_b32 s79, s28, 16 ; GFX9-NEXT: s_lshr_b32 s88, s27, 16 @@ -32869,22 +33063,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s35, s16, 16 ; GFX9-NEXT: s_cbranch_execnz .LBB45_3 ; GFX9-NEXT: .LBB45_2: ; %cmp.true -; GFX9-NEXT: s_add_u32 s44, s44, 3 -; GFX9-NEXT: s_addc_u32 s45, s45, 0 -; GFX9-NEXT: s_add_u32 s42, s42, 3 -; GFX9-NEXT: s_addc_u32 s43, s43, 0 -; GFX9-NEXT: s_add_u32 s40, s40, 3 -; GFX9-NEXT: s_addc_u32 s41, s41, 0 -; GFX9-NEXT: s_add_u32 s14, s14, 3 -; GFX9-NEXT: s_addc_u32 s15, s15, 0 -; GFX9-NEXT: s_add_u32 s12, s12, 3 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_add_u32 s10, s10, 3 -; GFX9-NEXT: s_addc_u32 s11, s11, 0 -; GFX9-NEXT: s_add_u32 s8, s8, 3 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-NEXT: s_add_u32 s6, s6, 3 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s45, s45, 3 +; GFX9-NEXT: s_addc_u32 s6, s6, 0 +; GFX9-NEXT: s_add_u32 s43, s43, 3 +; GFX9-NEXT: s_addc_u32 s44, s44, 0 +; GFX9-NEXT: s_add_u32 s41, s41, 3 +; GFX9-NEXT: s_addc_u32 s42, s42, 0 +; GFX9-NEXT: s_add_u32 s15, s15, 3 +; GFX9-NEXT: s_addc_u32 s40, s40, 0 +; GFX9-NEXT: s_add_u32 s13, s13, 3 +; GFX9-NEXT: s_addc_u32 s14, s14, 0 +; GFX9-NEXT: s_add_u32 s11, s11, 3 +; GFX9-NEXT: s_addc_u32 s12, s12, 0 +; GFX9-NEXT: s_add_u32 s9, s9, 3 +; GFX9-NEXT: s_addc_u32 s10, s10, 0 +; GFX9-NEXT: s_add_u32 s7, s7, 3 +; GFX9-NEXT: s_addc_u32 s8, s8, 0 ; GFX9-NEXT: s_add_u32 s28, s28, 3 ; GFX9-NEXT: s_addc_u32 s29, s29, 0 ; GFX9-NEXT: s_add_u32 s26, s26, 3 @@ -32899,22 +33093,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX9-NEXT: s_addc_u32 s19, s19, 0 ; GFX9-NEXT: s_add_u32 s16, s16, 3 ; GFX9-NEXT: s_addc_u32 s17, s17, 0 -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: s_lshr_b32 s47, s44, 16 -; GFX9-NEXT: s_lshr_b32 s56, s43, 16 -; GFX9-NEXT: s_lshr_b32 s57, s42, 16 -; GFX9-NEXT: s_lshr_b32 s58, s41, 16 -; GFX9-NEXT: s_lshr_b32 s59, s40, 16 -; GFX9-NEXT: s_lshr_b32 s60, s15, 16 -; GFX9-NEXT: s_lshr_b32 s61, s14, 16 -; GFX9-NEXT: s_lshr_b32 s62, s13, 16 -; GFX9-NEXT: s_lshr_b32 s63, s12, 16 -; GFX9-NEXT: s_lshr_b32 s72, s11, 16 -; GFX9-NEXT: s_lshr_b32 s73, s10, 16 -; GFX9-NEXT: s_lshr_b32 s74, s9, 16 -; GFX9-NEXT: s_lshr_b32 s75, s8, 16 -; GFX9-NEXT: s_lshr_b32 s76, s7, 16 -; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: s_lshr_b32 s47, s45, 16 +; GFX9-NEXT: s_lshr_b32 s56, s44, 16 +; GFX9-NEXT: s_lshr_b32 s57, s43, 16 +; GFX9-NEXT: s_lshr_b32 s58, s42, 16 +; GFX9-NEXT: s_lshr_b32 s59, s41, 16 +; GFX9-NEXT: s_lshr_b32 s60, s40, 16 +; GFX9-NEXT: s_lshr_b32 s61, s15, 16 +; GFX9-NEXT: s_lshr_b32 s62, s14, 16 +; GFX9-NEXT: s_lshr_b32 s63, s13, 16 +; GFX9-NEXT: s_lshr_b32 s72, s12, 16 +; GFX9-NEXT: s_lshr_b32 s73, s11, 16 +; GFX9-NEXT: s_lshr_b32 s74, s10, 16 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_lshr_b32 s76, s8, 16 +; GFX9-NEXT: s_lshr_b32 s77, s7, 16 ; GFX9-NEXT: s_lshr_b32 s78, s29, 16 ; GFX9-NEXT: s_lshr_b32 s79, s28, 16 ; GFX9-NEXT: s_lshr_b32 s88, s27, 16 @@ -32944,22 +33138,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 ; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 ; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 -; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 -; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 -; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 -; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 -; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 -; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s46 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 @@ -32974,22 +33168,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v11, s25 ; GFX9-NEXT: v_mov_b32_e32 v12, s26 ; GFX9-NEXT: v_mov_b32_e32 v13, s27 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v15, s7 -; GFX9-NEXT: v_mov_b32_e32 v16, s8 -; GFX9-NEXT: v_mov_b32_e32 v17, s9 -; GFX9-NEXT: v_mov_b32_e32 v18, s10 -; GFX9-NEXT: v_mov_b32_e32 v19, s11 -; GFX9-NEXT: v_mov_b32_e32 v20, s12 -; GFX9-NEXT: v_mov_b32_e32 v21, s13 -; GFX9-NEXT: v_mov_b32_e32 v22, s14 -; GFX9-NEXT: v_mov_b32_e32 v23, s15 -; GFX9-NEXT: v_mov_b32_e32 v24, s28 -; GFX9-NEXT: v_mov_b32_e32 v25, s29 -; GFX9-NEXT: v_mov_b32_e32 v26, s40 -; GFX9-NEXT: v_mov_b32_e32 v27, s41 -; GFX9-NEXT: v_mov_b32_e32 v28, s42 -; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, s7 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v16, s9 +; GFX9-NEXT: v_mov_b32_e32 v17, s10 +; GFX9-NEXT: v_mov_b32_e32 v18, s11 +; GFX9-NEXT: v_mov_b32_e32 v19, s12 +; GFX9-NEXT: v_mov_b32_e32 v20, s13 +; GFX9-NEXT: v_mov_b32_e32 v21, s14 +; GFX9-NEXT: v_mov_b32_e32 v22, s15 +; GFX9-NEXT: v_mov_b32_e32 v23, s28 +; GFX9-NEXT: v_mov_b32_e32 v24, s29 +; GFX9-NEXT: v_mov_b32_e32 v25, s40 +; GFX9-NEXT: v_mov_b32_e32 v26, s41 +; GFX9-NEXT: v_mov_b32_e32 v27, s42 +; GFX9-NEXT: v_mov_b32_e32 v28, s43 +; GFX9-NEXT: v_mov_b32_e32 v29, s6 ; GFX9-NEXT: v_readlane_b32 s35, v30, 3 ; GFX9-NEXT: v_readlane_b32 s34, v30, 2 ; GFX9-NEXT: v_readlane_b32 s31, v30, 1 @@ -33030,7 +33224,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: ; implicit-def: $sgpr47 ; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB45_2 +; GFX9-NEXT: s_branch .LBB45_3 ; ; GFX11-LABEL: bitcast_v15i64_to_v60f16_scalar: ; GFX11: ; %bb.0: @@ -33045,16 +33241,16 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX11-NEXT: v_readfirstlane_b32 s10, v6 ; GFX11-NEXT: v_readfirstlane_b32 s11, v7 ; GFX11-NEXT: v_readfirstlane_b32 s12, v8 -; GFX11-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-NEXT: v_readfirstlane_b32 s14, v9 ; GFX11-NEXT: v_readfirstlane_b32 s15, v10 -; GFX11-NEXT: v_readfirstlane_b32 s14, v11 -; GFX11-NEXT: s_mov_b32 s94, 0 +; GFX11-NEXT: v_readfirstlane_b32 s13, v11 +; GFX11-NEXT: s_mov_b32 s94, -1 ; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 ; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_lshr_b32 s43, s12, 16 ; GFX11-NEXT: s_lshr_b32 s44, s11, 16 ; GFX11-NEXT: s_lshr_b32 s45, s10, 16 @@ -33082,13 +33278,12 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX11-NEXT: s_lshr_b32 s91, s2, 16 ; GFX11-NEXT: s_lshr_b32 s92, s1, 16 ; GFX11-NEXT: s_lshr_b32 s93, s0, 16 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 -; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-NEXT: s_cbranch_execnz .LBB45_3 ; GFX11-NEXT: .LBB45_2: ; %cmp.true ; GFX11-NEXT: s_add_u32 s15, s15, 3 -; GFX11-NEXT: s_addc_u32 s14, s14, 0 -; GFX11-NEXT: s_add_u32 s12, s12, 3 ; GFX11-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-NEXT: s_add_u32 s12, s12, 3 +; GFX11-NEXT: s_addc_u32 s14, s14, 0 ; GFX11-NEXT: s_add_u32 s10, s10, 3 ; GFX11-NEXT: s_addc_u32 s11, s11, 0 ; GFX11-NEXT: s_add_u32 s8, s8, 3 @@ -33115,9 +33310,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-NEXT: s_lshr_b32 s40, s13, 16 ; GFX11-NEXT: s_lshr_b32 s41, s15, 16 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_lshr_b32 s43, s12, 16 ; GFX11-NEXT: s_lshr_b32 s44, s11, 16 ; GFX11-NEXT: s_lshr_b32 s45, s10, 16 @@ -33174,9 +33369,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45 ; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44 ; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s42 ; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41 -; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 @@ -33190,8 +33385,8 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 ; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 ; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 -; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 -; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s14 +; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s13 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB45_4: ; GFX11-NEXT: ; implicit-def: $sgpr93 @@ -33224,7 +33419,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr41 ; GFX11-NEXT: ; implicit-def: $sgpr40 -; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-NEXT: s_cbranch_vccz .LBB45_2 +; GFX11-NEXT: s_branch .LBB45_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -34835,11 +35032,11 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 @@ -34853,83 +35050,92 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -34938,260 +35144,240 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: v_or_b32_e32 v8, v49, v8 +; SI-NEXT: v_mov_b32_e32 v57, v48 +; SI-NEXT: v_mov_b32_e32 v56, v36 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: v_mov_b32_e32 v46, v37 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v45, v35 ; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v44, v34 ; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: v_or_b32_e32 v14, v55, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v16, v53, v16 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 +; SI-NEXT: v_mov_b32_e32 v35, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v54, v29 -; SI-NEXT: v_mov_b32_e32 v54, v32 ; SI-NEXT: s_branch .LBB47_3 ; SI-NEXT: .LBB47_2: -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: v_mov_b32_e32 v52, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v45, v35 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: v_mov_b32_e32 v57, v48 +; SI-NEXT: v_mov_b32_e32 v56, v36 +; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: v_mov_b32_e32 v46, v37 ; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v44, v34 ; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v43, v62 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 ; SI-NEXT: .LBB47_3: ; %Flow ; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB47_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 ; SI-NEXT: v_mov_b32_e32 v55, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -35199,42 +35385,48 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -35247,65 +35439,65 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -35317,14 +35509,14 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -35332,9 +35524,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -35344,7 +35536,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -35356,32 +35548,39 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -35466,6 +35665,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; VI-NEXT: s_lshr_b32 s42, s17, 16 ; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v32, v15 ; VI-NEXT: v_mov_b32_e32 v33, v14 ; VI-NEXT: v_mov_b32_e32 v34, v13 @@ -35482,7 +35682,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v53, v2 ; VI-NEXT: v_mov_b32_e32 v54, v1 ; VI-NEXT: v_mov_b32_e32 v55, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: s_cbranch_scc0 .LBB47_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 @@ -35686,11 +35886,28 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB47_2 +; VI-NEXT: s_branch .LBB47_3 ; ; GFX9-LABEL: bitcast_v60f16_to_v15i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 ; GFX9-NEXT: v_mov_b32_e32 v32, v15 ; GFX9-NEXT: v_mov_b32_e32 v33, v14 ; GFX9-NEXT: v_mov_b32_e32 v34, v13 @@ -35707,21 +35924,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v53, v2 ; GFX9-NEXT: v_mov_b32_e32 v54, v1 ; GFX9-NEXT: v_mov_b32_e32 v55, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 -; GFX9-NEXT: s_lshr_b32 s41, s28, 16 -; GFX9-NEXT: s_lshr_b32 s42, s27, 16 -; GFX9-NEXT: s_lshr_b32 s43, s26, 16 -; GFX9-NEXT: s_lshr_b32 s15, s25, 16 -; GFX9-NEXT: s_lshr_b32 s14, s24, 16 -; GFX9-NEXT: s_lshr_b32 s13, s23, 16 -; GFX9-NEXT: s_lshr_b32 s12, s22, 16 -; GFX9-NEXT: s_lshr_b32 s11, s21, 16 -; GFX9-NEXT: s_lshr_b32 s10, s20, 16 -; GFX9-NEXT: s_lshr_b32 s9, s19, 16 -; GFX9-NEXT: s_lshr_b32 s8, s18, 16 -; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -35742,7 +35945,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 @@ -35757,6 +35959,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 @@ -35904,7 +36107,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB47_4: ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB47_2 +; GFX9-NEXT: s_branch .LBB47_3 ; ; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -35949,41 +36154,41 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -35998,17 +36203,16 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB47_3 ; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -36022,24 +36226,24 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -36056,7 +36260,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB47_4: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB47_2 +; GFX11-TRUE16-NEXT: s_branch .LBB47_3 ; ; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64_scalar: ; GFX11-FAKE16: ; %bb.0: @@ -36089,41 +36295,41 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -36138,17 +36344,16 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB47_3 ; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -36162,24 +36367,24 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -36196,7 +36401,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB47_4: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_branch .LBB47_2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB47_2 +; GFX11-FAKE16-NEXT: s_branch .LBB47_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -37142,6 +37349,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v27, s16 ; SI-NEXT: v_mov_b32_e32 v28, s17 ; SI-NEXT: v_mov_b32_e32 v29, s18 @@ -37154,9 +37362,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v22, s25 ; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: v_mov_b32_e32 v20, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -37487,12 +37695,15 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB49_2 +; SI-NEXT: s_branch .LBB49_3 ; ; VI-LABEL: bitcast_v15f64_to_v60i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v17, s16 ; VI-NEXT: v_mov_b32_e32 v18, s17 ; VI-NEXT: v_mov_b32_e32 v29, s18 @@ -37505,9 +37716,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v24, s25 ; VI-NEXT: v_mov_b32_e32 v21, s26 ; VI-NEXT: v_mov_b32_e32 v22, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v19, s28 ; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -37721,12 +37932,15 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB49_2 +; VI-NEXT: s_branch .LBB49_3 ; ; GFX9-LABEL: bitcast_v15f64_to_v60i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v17, s16 ; GFX9-NEXT: v_mov_b32_e32 v18, s17 ; GFX9-NEXT: v_mov_b32_e32 v29, s18 @@ -37739,9 +37953,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v24, s25 ; GFX9-NEXT: v_mov_b32_e32 v21, s26 ; GFX9-NEXT: v_mov_b32_e32 v22, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v19, s28 ; GFX9-NEXT: v_mov_b32_e32 v20, s29 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -37955,7 +38169,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB49_2 +; GFX9-NEXT: s_branch .LBB49_3 ; ; GFX11-LABEL: bitcast_v15f64_to_v60i16_scalar: ; GFX11: ; %bb.0: @@ -37970,8 +38186,8 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25 ; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB49_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11 @@ -38004,8 +38220,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28 ; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-NEXT: s_cbranch_execnz .LBB49_3 ; GFX11-NEXT: .LBB49_2: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -38154,7 +38369,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX11-NEXT: ; implicit-def: $vgpr52 ; GFX11-NEXT: ; implicit-def: $vgpr51 ; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: s_branch .LBB49_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB49_2 +; GFX11-NEXT: s_branch .LBB49_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -39572,6 +39789,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v35, v22 ; SI-NEXT: v_mov_b32_e32 v36, v20 ; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 @@ -39603,7 +39821,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 @@ -39906,7 +40124,9 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: s_branch .LBB51_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccz .LBB51_2 +; SI-NEXT: s_branch .LBB51_3 ; ; VI-LABEL: bitcast_v60i16_to_v15f64_scalar: ; VI: ; %bb.0: @@ -39926,6 +40146,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_lshr_b32 s42, s17, 16 ; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v32, v15 ; VI-NEXT: v_mov_b32_e32 v33, v14 ; VI-NEXT: v_mov_b32_e32 v34, v13 @@ -39942,7 +40163,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v53, v2 ; VI-NEXT: v_mov_b32_e32 v54, v1 ; VI-NEXT: v_mov_b32_e32 v55, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 @@ -40189,11 +40410,28 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB51_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB51_2 +; VI-NEXT: s_branch .LBB51_3 ; ; GFX9-LABEL: bitcast_v60i16_to_v15f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 ; GFX9-NEXT: v_mov_b32_e32 v32, v15 ; GFX9-NEXT: v_mov_b32_e32 v33, v14 ; GFX9-NEXT: v_mov_b32_e32 v34, v13 @@ -40210,21 +40448,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v53, v2 ; GFX9-NEXT: v_mov_b32_e32 v54, v1 ; GFX9-NEXT: v_mov_b32_e32 v55, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 -; GFX9-NEXT: s_lshr_b32 s41, s28, 16 -; GFX9-NEXT: s_lshr_b32 s42, s27, 16 -; GFX9-NEXT: s_lshr_b32 s43, s26, 16 -; GFX9-NEXT: s_lshr_b32 s15, s25, 16 -; GFX9-NEXT: s_lshr_b32 s14, s24, 16 -; GFX9-NEXT: s_lshr_b32 s13, s23, 16 -; GFX9-NEXT: s_lshr_b32 s12, s22, 16 -; GFX9-NEXT: s_lshr_b32 s11, s21, 16 -; GFX9-NEXT: s_lshr_b32 s10, s20, 16 -; GFX9-NEXT: s_lshr_b32 s9, s19, 16 -; GFX9-NEXT: s_lshr_b32 s8, s18, 16 -; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -40245,7 +40469,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 @@ -40260,6 +40483,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 @@ -40405,7 +40629,9 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB51_4: ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB51_2 +; GFX9-NEXT: s_branch .LBB51_3 ; ; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -40450,41 +40676,41 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -40499,17 +40725,16 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_3 ; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -40523,24 +40748,24 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -40557,7 +40782,9 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB51_4: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB51_2 +; GFX11-TRUE16-NEXT: s_branch .LBB51_3 ; ; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64_scalar: ; GFX11-FAKE16: ; %bb.0: @@ -40590,41 +40817,41 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -40639,17 +40866,16 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_3 ; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -40663,24 +40889,24 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -40697,7 +40923,9 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB51_4: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_branch .LBB51_2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB51_2 +; GFX11-FAKE16-NEXT: s_branch .LBB51_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -41991,6 +42219,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: s_and_b64 s[44:45], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s42, v1 ; SI-NEXT: v_readfirstlane_b32 s43, v2 ; SI-NEXT: v_readfirstlane_b32 s40, v3 @@ -42006,8 +42235,8 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_readfirstlane_b32 s6, v13 ; SI-NEXT: v_readfirstlane_b32 s7, v14 ; SI-NEXT: v_readfirstlane_b32 s4, v15 -; SI-NEXT: s_and_b64 s[44:45], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: s_mov_b64 s[44:45], -1 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -42542,12 +42771,15 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: s_branch .LBB53_2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[44:45] +; SI-NEXT: s_cbranch_vccz .LBB53_2 +; SI-NEXT: s_branch .LBB53_3 ; ; VI-LABEL: bitcast_v15f64_to_v60f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v17, s16 ; VI-NEXT: v_mov_b32_e32 v18, s17 ; VI-NEXT: v_mov_b32_e32 v29, s18 @@ -42560,9 +42792,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; VI-NEXT: v_mov_b32_e32 v24, s25 ; VI-NEXT: v_mov_b32_e32 v21, s26 ; VI-NEXT: v_mov_b32_e32 v22, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v19, s28 ; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -42776,12 +43008,15 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: s_branch .LBB53_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB53_2 +; VI-NEXT: s_branch .LBB53_3 ; ; GFX9-LABEL: bitcast_v15f64_to_v60f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v17, s16 ; GFX9-NEXT: v_mov_b32_e32 v18, s17 ; GFX9-NEXT: v_mov_b32_e32 v29, s18 @@ -42794,9 +43029,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v24, s25 ; GFX9-NEXT: v_mov_b32_e32 v21, s26 ; GFX9-NEXT: v_mov_b32_e32 v22, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v19, s28 ; GFX9-NEXT: v_mov_b32_e32 v20, s29 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -43010,7 +43245,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: s_branch .LBB53_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB53_2 +; GFX9-NEXT: s_branch .LBB53_3 ; ; GFX11-LABEL: bitcast_v15f64_to_v60f16_scalar: ; GFX11: ; %bb.0: @@ -43025,8 +43262,8 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25 ; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11 @@ -43059,8 +43296,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28 ; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31 ; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-NEXT: s_cbranch_execnz .LBB53_3 ; GFX11-NEXT: .LBB53_2: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -43209,7 +43445,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX11-NEXT: ; implicit-def: $vgpr52 ; GFX11-NEXT: ; implicit-def: $vgpr51 ; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: s_branch .LBB53_2 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB53_2 +; GFX11-NEXT: s_branch .LBB53_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -44820,11 +45058,11 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 @@ -44838,83 +45076,92 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -44923,260 +45170,240 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v22, v18 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 -; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: v_or_b32_e32 v8, v49, v8 +; SI-NEXT: v_mov_b32_e32 v57, v48 +; SI-NEXT: v_mov_b32_e32 v56, v36 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: v_mov_b32_e32 v46, v37 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v45, v35 ; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v44, v34 ; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: v_or_b32_e32 v14, v55, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 -; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v16, v53, v16 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 +; SI-NEXT: v_mov_b32_e32 v35, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v54, v29 -; SI-NEXT: v_mov_b32_e32 v54, v32 ; SI-NEXT: s_branch .LBB55_3 ; SI-NEXT: .LBB55_2: -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: v_mov_b32_e32 v52, v12 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v45, v35 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: v_mov_b32_e32 v57, v48 +; SI-NEXT: v_mov_b32_e32 v56, v36 +; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: v_mov_b32_e32 v46, v37 ; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v44, v34 ; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v43, v62 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 ; SI-NEXT: .LBB55_3: ; %Flow ; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB55_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 ; SI-NEXT: v_mov_b32_e32 v55, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -45184,42 +45411,48 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -45232,65 +45465,65 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v44 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 @@ -45302,14 +45535,14 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 @@ -45317,9 +45550,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -45329,7 +45562,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 @@ -45341,32 +45574,39 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -45451,6 +45691,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; VI-NEXT: s_lshr_b32 s42, s17, 16 ; VI-NEXT: s_lshr_b32 s43, s16, 16 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v32, v15 ; VI-NEXT: v_mov_b32_e32 v33, v14 ; VI-NEXT: v_mov_b32_e32 v34, v13 @@ -45467,7 +45708,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; VI-NEXT: v_mov_b32_e32 v53, v2 ; VI-NEXT: v_mov_b32_e32 v54, v1 ; VI-NEXT: v_mov_b32_e32 v55, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: s_cbranch_scc0 .LBB55_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v0, 16 @@ -45671,11 +45912,28 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB55_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccz .LBB55_2 +; VI-NEXT: s_branch .LBB55_3 ; ; GFX9-LABEL: bitcast_v60f16_to_v15f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 ; GFX9-NEXT: v_mov_b32_e32 v32, v15 ; GFX9-NEXT: v_mov_b32_e32 v33, v14 ; GFX9-NEXT: v_mov_b32_e32 v34, v13 @@ -45692,21 +45950,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v53, v2 ; GFX9-NEXT: v_mov_b32_e32 v54, v1 ; GFX9-NEXT: v_mov_b32_e32 v55, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_lshr_b32 s40, s29, 16 -; GFX9-NEXT: s_lshr_b32 s41, s28, 16 -; GFX9-NEXT: s_lshr_b32 s42, s27, 16 -; GFX9-NEXT: s_lshr_b32 s43, s26, 16 -; GFX9-NEXT: s_lshr_b32 s15, s25, 16 -; GFX9-NEXT: s_lshr_b32 s14, s24, 16 -; GFX9-NEXT: s_lshr_b32 s13, s23, 16 -; GFX9-NEXT: s_lshr_b32 s12, s22, 16 -; GFX9-NEXT: s_lshr_b32 s11, s21, 16 -; GFX9-NEXT: s_lshr_b32 s10, s20, 16 -; GFX9-NEXT: s_lshr_b32 s9, s19, 16 -; GFX9-NEXT: s_lshr_b32 s8, s18, 16 -; GFX9-NEXT: s_lshr_b32 s7, s17, 16 -; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -45727,7 +45971,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 @@ -45742,6 +45985,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 ; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 ; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 @@ -45889,7 +46133,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB55_4: ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccz .LBB55_2 +; GFX9-NEXT: s_branch .LBB55_3 ; ; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15f64_scalar: ; GFX11-TRUE16: ; %bb.0: @@ -45934,41 +46180,41 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -45983,17 +46229,16 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_3 ; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -46007,24 +46252,24 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -46041,7 +46286,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB55_4: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB55_2 +; GFX11-TRUE16-NEXT: s_branch .LBB55_3 ; ; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15f64_scalar: ; GFX11-FAKE16: ; %bb.0: @@ -46074,41 +46321,41 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 -; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 @@ -46123,17 +46370,16 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 -; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 -; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_3 ; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 @@ -46147,24 +46393,24 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -46181,7 +46427,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB55_4: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_branch .LBB55_2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB55_2 +; GFX11-FAKE16-NEXT: s_branch .LBB55_3 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -47867,9 +48115,10 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB57_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 @@ -47917,86 +48166,87 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_mov_b32_e32 v36, v35 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 +; SI-NEXT: v_mov_b32_e32 v35, v34 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 -; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: v_mov_b32_e32 v34, v33 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 -; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 +; SI-NEXT: v_mov_b32_e32 v32, v50 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v63 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -48029,27 +48279,27 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v36, v35 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v35, v34 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v34, v33 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v32, v50 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -48151,6 +48401,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v33 ; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v34 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v36 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 ; SI-NEXT: s_add_i32 s17, s17, 3 @@ -48193,50 +48444,49 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 ; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 ; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -48746,11 +48996,12 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; VI-NEXT: s_lshr_b32 s41, s18, 16 ; VI-NEXT: s_lshr_b32 s42, s17, 16 ; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 @@ -48763,10 +49014,13 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: s_cbranch_scc0 .LBB57_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB57_3 -; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB57_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB57_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_add_i32 s43, s43, 3 ; VI-NEXT: s_add_i32 s17, s17, 3 @@ -48827,7 +49081,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 -; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: .LBB57_4: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: s_and_b32 s4, 0xffff, s16 ; VI-NEXT: s_lshl_b32 s5, s43, 16 @@ -48919,8 +49173,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v14, v30 ; VI-NEXT: v_mov_b32_e32 v15, v31 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB57_4: -; VI-NEXT: s_branch .LBB57_2 ; ; GFX9-LABEL: bitcast_v60i16_to_v60f16_scalar: ; GFX9: ; %bb.0: @@ -48940,11 +49192,12 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 ; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 @@ -48969,10 +49222,13 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB57_4 -; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB57_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 ; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 @@ -49080,8 +49336,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 ; GFX9-NEXT: s_branch .LBB57_5 -; GFX9-NEXT: .LBB57_3: -; GFX9-NEXT: s_branch .LBB57_2 ; GFX9-NEXT: .LBB57_4: ; GFX9-NEXT: v_mov_b32_e32 v30, s29 ; GFX9-NEXT: v_mov_b32_e32 v31, s28 @@ -49249,13 +49503,16 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s46, -1 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 -; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: .LBB57_2: ; %Flow +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 -; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 @@ -49359,8 +49616,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 ; GFX11-TRUE16-NEXT: s_branch .LBB57_5 -; GFX11-TRUE16-NEXT: .LBB57_3: -; GFX11-TRUE16-NEXT: s_branch .LBB57_2 ; GFX11-TRUE16-NEXT: .LBB57_4: ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26 @@ -49480,19 +49735,22 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s46, -1 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 -; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: .LBB57_2: ; %Flow +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 -; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 @@ -49529,10 +49787,10 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -49559,12 +49817,12 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, s12, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s11, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v49, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v49, s8, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v48, s0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v39, s1, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v38, s2, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s6, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38 @@ -49596,8 +49854,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 ; GFX11-FAKE16-NEXT: s_branch .LBB57_5 -; GFX11-FAKE16-NEXT: .LBB57_3: -; GFX11-FAKE16-NEXT: s_branch .LBB57_2 ; GFX11-FAKE16-NEXT: .LBB57_4: ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26 @@ -49614,8 +49870,8 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s15 :: v_dual_mov_b32 v65, s14 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s13 :: v_dual_mov_b32 v67, s12 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s11 :: v_dual_mov_b32 v69, s10 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s9 :: v_dual_mov_b32 v71, s7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s9 :: v_dual_mov_b32 v81, s7 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s5 ; GFX11-FAKE16-NEXT: .LBB57_5: ; %end ; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 @@ -51066,13 +51322,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v58, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v57, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: s_cbranch_scc0 .LBB59_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB59_3 -; SI-NEXT: .LBB59_2: -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: .LBB59_3: ; %Flow +; SI-NEXT: .LBB59_2: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v14 ; SI-NEXT: v_mov_b32_e32 v63, v15 @@ -51084,8 +51338,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v8, v5 ; SI-NEXT: v_mov_b32_e32 v5, v42 ; SI-NEXT: v_mov_b32_e32 v42, v1 -; SI-NEXT: s_cbranch_vccnz .LBB59_5 -; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_cbranch_vccnz .LBB59_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 @@ -51348,7 +51602,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_alignbit_b32 v19, v3, v19, 16 ; SI-NEXT: v_alignbit_b32 v36, v62, v14, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: .LBB59_4: ; %end ; SI-NEXT: v_and_b32_e32 v39, 0xffff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v60 ; SI-NEXT: v_or_b32_e32 v39, v39, v50 @@ -51562,11 +51816,12 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; VI-NEXT: s_lshr_b32 s41, s18, 16 ; VI-NEXT: s_lshr_b32 s42, s17, 16 ; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 @@ -51591,10 +51846,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: s_cbranch_scc0 .LBB59_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_cbranch_execnz .LBB59_4 -; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB59_2: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB59_4 +; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v54, 0x200 ; VI-NEXT: v_add_f16_e32 v32, s16, v54 ; VI-NEXT: v_add_f16_e32 v59, s43, v54 @@ -51657,8 +51915,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 ; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 ; VI-NEXT: s_branch .LBB59_5 -; VI-NEXT: .LBB59_3: -; VI-NEXT: s_branch .LBB59_2 ; VI-NEXT: .LBB59_4: ; VI-NEXT: v_mov_b32_e32 v54, s6 ; VI-NEXT: v_mov_b32_e32 v53, s29 @@ -51798,11 +52054,12 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s8, s18, 16 ; GFX9-NEXT: s_lshr_b32 s7, s17, 16 ; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 @@ -51827,10 +52084,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_cbranch_execnz .LBB59_4 -; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_2: ; %Flow +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 @@ -51940,8 +52200,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 ; GFX9-NEXT: s_branch .LBB59_5 -; GFX9-NEXT: .LBB59_3: -; GFX9-NEXT: s_branch .LBB59_2 ; GFX9-NEXT: .LBB59_4: ; GFX9-NEXT: v_mov_b32_e32 v30, s29 ; GFX9-NEXT: v_mov_b32_e32 v31, s28 @@ -52109,13 +52367,16 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 ; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s46, -1 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 -; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo -; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: .LBB59_2: ; %Flow +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 -; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 @@ -52219,8 +52480,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 ; GFX11-TRUE16-NEXT: s_branch .LBB59_5 -; GFX11-TRUE16-NEXT: .LBB59_3: -; GFX11-TRUE16-NEXT: s_branch .LBB59_2 ; GFX11-TRUE16-NEXT: .LBB59_4: ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26 @@ -52340,19 +52599,22 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s2, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 ; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s46, -1 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 -; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo -; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: .LBB59_2: ; %Flow +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 -; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 @@ -52389,10 +52651,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] @@ -52419,12 +52681,12 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, s12 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s11 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v49, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v49, 0x200, s8 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v48, 0x200, s0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v39, 0x200, s1 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v38, 0x200, s2 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s6 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38 @@ -52456,8 +52718,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 ; GFX11-FAKE16-NEXT: s_branch .LBB59_5 -; GFX11-FAKE16-NEXT: .LBB59_3: -; GFX11-FAKE16-NEXT: s_branch .LBB59_2 ; GFX11-FAKE16-NEXT: .LBB59_4: ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26 @@ -52474,8 +52734,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s15 :: v_dual_mov_b32 v65, s14 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s13 :: v_dual_mov_b32 v67, s12 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s11 :: v_dual_mov_b32 v69, s10 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s9 :: v_dual_mov_b32 v71, s7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s9 :: v_dual_mov_b32 v81, s7 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s5 ; GFX11-FAKE16-NEXT: .LBB59_5: ; %end ; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 |
