summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll6978
1 files changed, 3619 insertions, 3359 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 4f4687507680..6e60051bd996 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -201,6 +201,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -218,7 +219,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a,
; SI-NEXT: v_mov_b32_e32 v15, v1
; SI-NEXT: v_mov_b32_e32 v14, v0
; SI-NEXT: v_mov_b32_e32 v0, s16
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
@@ -232,10 +233,13 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a,
; SI-NEXT: v_mov_b32_e32 v11, s27
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB1_4
+; SI-NEXT: s_cbranch_scc0 .LBB1_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB1_3
-; SI-NEXT: .LBB1_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB1_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB1_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27
@@ -266,16 +270,15 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a,
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
-; SI-NEXT: .LBB1_3: ; %end
+; SI-NEXT: .LBB1_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB1_4:
-; SI-NEXT: s_branch .LBB1_2
;
; VI-LABEL: bitcast_v30i32_to_v30f32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -293,7 +296,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a,
; VI-NEXT: v_mov_b32_e32 v15, v1
; VI-NEXT: v_mov_b32_e32 v14, v0
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
@@ -307,10 +310,13 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a,
; VI-NEXT: v_mov_b32_e32 v11, s27
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB1_4
+; VI-NEXT: s_cbranch_scc0 .LBB1_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB1_3
-; VI-NEXT: .LBB1_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB1_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB1_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29
; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28
; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27
@@ -341,16 +347,15 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a,
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT: .LBB1_3: ; %end
+; VI-NEXT: .LBB1_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB1_4:
-; VI-NEXT: s_branch .LBB1_2
;
; GFX9-LABEL: bitcast_v30i32_to_v30f32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -368,7 +373,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v15, v1
; GFX9-NEXT: v_mov_b32_e32 v14, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
@@ -382,10 +387,13 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v11, s27
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB1_4
+; GFX9-NEXT: s_cbranch_scc0 .LBB1_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB1_3
-; GFX9-NEXT: .LBB1_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB1_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB1_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_u32_e32 v29, 3, v29
; GFX9-NEXT: v_add_u32_e32 v28, 3, v28
; GFX9-NEXT: v_add_u32_e32 v27, 3, v27
@@ -416,43 +424,41 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a,
; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT: .LBB1_3: ; %end
+; GFX9-NEXT: .LBB1_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB1_4:
-; GFX9-NEXT: s_branch .LBB1_2
;
; GFX11-LABEL: bitcast_v30i32_to_v30f32_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB1_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB1_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB1_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB1_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB1_3:
-; GFX11-NEXT: .LBB1_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB1_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29
; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28
; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27
@@ -483,6 +489,7 @@ define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a,
; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB1_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -681,6 +688,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -698,7 +706,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; SI-NEXT: v_mov_b32_e32 v15, v1
; SI-NEXT: v_mov_b32_e32 v14, v0
; SI-NEXT: v_mov_b32_e32 v0, s16
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
@@ -712,10 +720,13 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; SI-NEXT: v_mov_b32_e32 v11, s27
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB3_4
+; SI-NEXT: s_cbranch_scc0 .LBB3_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB3_3
-; SI-NEXT: .LBB3_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB3_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB3_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_f32_e32 v29, 1.0, v29
; SI-NEXT: v_add_f32_e32 v28, 1.0, v28
; SI-NEXT: v_add_f32_e32 v27, 1.0, v27
@@ -746,16 +757,15 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
-; SI-NEXT: .LBB3_3: ; %end
+; SI-NEXT: .LBB3_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB3_4:
-; SI-NEXT: s_branch .LBB3_2
;
; VI-LABEL: bitcast_v30f32_to_v30i32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -773,7 +783,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; VI-NEXT: v_mov_b32_e32 v15, v1
; VI-NEXT: v_mov_b32_e32 v14, v0
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
@@ -787,10 +797,13 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; VI-NEXT: v_mov_b32_e32 v11, s27
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB3_4
+; VI-NEXT: s_cbranch_scc0 .LBB3_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB3_3
-; VI-NEXT: .LBB3_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB3_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB3_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_f32_e32 v29, 1.0, v29
; VI-NEXT: v_add_f32_e32 v28, 1.0, v28
; VI-NEXT: v_add_f32_e32 v27, 1.0, v27
@@ -821,16 +834,15 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; VI-NEXT: v_add_f32_e32 v2, 1.0, v2
; VI-NEXT: v_add_f32_e32 v1, 1.0, v1
; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
-; VI-NEXT: .LBB3_3: ; %end
+; VI-NEXT: .LBB3_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB3_4:
-; VI-NEXT: s_branch .LBB3_2
;
; GFX9-LABEL: bitcast_v30f32_to_v30i32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -848,7 +860,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v15, v1
; GFX9-NEXT: v_mov_b32_e32 v14, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
@@ -862,10 +874,13 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v11, s27
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB3_4
+; GFX9-NEXT: s_cbranch_scc0 .LBB3_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB3_3
-; GFX9-NEXT: .LBB3_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB3_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB3_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29
; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28
; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27
@@ -896,43 +911,41 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT: .LBB3_3: ; %end
+; GFX9-NEXT: .LBB3_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB3_4:
-; GFX9-NEXT: s_branch .LBB3_2
;
; GFX11-LABEL: bitcast_v30f32_to_v30i32_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB3_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB3_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB3_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB3_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB3_3:
-; GFX11-NEXT: .LBB3_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB3_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
@@ -948,6 +961,7 @@ define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT: .LBB3_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -1161,6 +1175,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -1178,7 +1193,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v15, v1
; SI-NEXT: v_mov_b32_e32 v14, v0
; SI-NEXT: v_mov_b32_e32 v0, s16
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
@@ -1192,10 +1207,13 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v11, s27
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB5_4
+; SI-NEXT: s_cbranch_scc0 .LBB5_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB5_3
-; SI-NEXT: .LBB5_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB5_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB5_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27
@@ -1226,16 +1244,15 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
-; SI-NEXT: .LBB5_3: ; %end
+; SI-NEXT: .LBB5_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB5_4:
-; SI-NEXT: s_branch .LBB5_2
;
; VI-LABEL: bitcast_v30i32_to_v15i64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -1253,7 +1270,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v15, v1
; VI-NEXT: v_mov_b32_e32 v14, v0
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
@@ -1267,10 +1284,13 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v11, s27
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB5_4
+; VI-NEXT: s_cbranch_scc0 .LBB5_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB5_3
-; VI-NEXT: .LBB5_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB5_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB5_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29
; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28
; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27
@@ -1301,16 +1321,15 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT: .LBB5_3: ; %end
+; VI-NEXT: .LBB5_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB5_4:
-; VI-NEXT: s_branch .LBB5_2
;
; GFX9-LABEL: bitcast_v30i32_to_v15i64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -1328,7 +1347,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v15, v1
; GFX9-NEXT: v_mov_b32_e32 v14, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
@@ -1342,10 +1361,13 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v11, s27
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB5_4
+; GFX9-NEXT: s_cbranch_scc0 .LBB5_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB5_3
-; GFX9-NEXT: .LBB5_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB5_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB5_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_u32_e32 v29, 3, v29
; GFX9-NEXT: v_add_u32_e32 v28, 3, v28
; GFX9-NEXT: v_add_u32_e32 v27, 3, v27
@@ -1376,43 +1398,41 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT: .LBB5_3: ; %end
+; GFX9-NEXT: .LBB5_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB5_4:
-; GFX9-NEXT: s_branch .LBB5_2
;
; GFX11-LABEL: bitcast_v30i32_to_v15i64_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB5_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB5_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB5_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB5_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB5_3:
-; GFX11-NEXT: .LBB5_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB5_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29
; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28
; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27
@@ -1443,6 +1463,7 @@ define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i3
; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB5_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -1664,6 +1685,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -1681,7 +1703,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v15, v1
; SI-NEXT: v_mov_b32_e32 v14, v0
; SI-NEXT: v_mov_b32_e32 v0, s16
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
@@ -1695,10 +1717,13 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v11, s27
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB7_4
+; SI-NEXT: s_cbranch_scc0 .LBB7_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB7_3
-; SI-NEXT: .LBB7_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB7_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB7_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
@@ -1729,16 +1754,15 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; SI-NEXT: .LBB7_3: ; %end
+; SI-NEXT: .LBB7_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB7_4:
-; SI-NEXT: s_branch .LBB7_2
;
; VI-LABEL: bitcast_v15i64_to_v30i32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -1756,7 +1780,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v15, v1
; VI-NEXT: v_mov_b32_e32 v14, v0
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
@@ -1770,10 +1794,13 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v11, s27
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB7_4
+; VI-NEXT: s_cbranch_scc0 .LBB7_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB7_3
-; VI-NEXT: .LBB7_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB7_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB7_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28
; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26
@@ -1804,16 +1831,15 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: .LBB7_3: ; %end
+; VI-NEXT: .LBB7_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB7_4:
-; VI-NEXT: s_branch .LBB7_2
;
; GFX9-LABEL: bitcast_v15i64_to_v30i32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -1831,7 +1857,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v15, v1
; GFX9-NEXT: v_mov_b32_e32 v14, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
@@ -1845,10 +1871,13 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v11, s27
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB7_4
+; GFX9-NEXT: s_cbranch_scc0 .LBB7_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB7_3
-; GFX9-NEXT: .LBB7_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB7_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB7_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28
; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc
; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26
@@ -1879,43 +1908,41 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: .LBB7_3: ; %end
+; GFX9-NEXT: .LBB7_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB7_4:
-; GFX9-NEXT: s_branch .LBB7_2
;
; GFX11-LABEL: bitcast_v15i64_to_v30i32_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB7_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB7_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB7_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB7_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB7_3:
-; GFX11-NEXT: .LBB7_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB7_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo
@@ -1954,6 +1981,7 @@ define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i3
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT: .LBB7_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -2167,6 +2195,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -2184,7 +2213,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a,
; SI-NEXT: v_mov_b32_e32 v15, v1
; SI-NEXT: v_mov_b32_e32 v14, v0
; SI-NEXT: v_mov_b32_e32 v0, s16
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
@@ -2198,10 +2227,13 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a,
; SI-NEXT: v_mov_b32_e32 v11, s27
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB9_4
+; SI-NEXT: s_cbranch_scc0 .LBB9_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB9_3
-; SI-NEXT: .LBB9_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB9_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB9_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27
@@ -2232,16 +2264,15 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a,
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
-; SI-NEXT: .LBB9_3: ; %end
+; SI-NEXT: .LBB9_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB9_4:
-; SI-NEXT: s_branch .LBB9_2
;
; VI-LABEL: bitcast_v30i32_to_v15f64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -2259,7 +2290,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a,
; VI-NEXT: v_mov_b32_e32 v15, v1
; VI-NEXT: v_mov_b32_e32 v14, v0
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
@@ -2273,10 +2304,13 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a,
; VI-NEXT: v_mov_b32_e32 v11, s27
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB9_4
+; VI-NEXT: s_cbranch_scc0 .LBB9_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB9_3
-; VI-NEXT: .LBB9_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB9_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB9_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29
; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28
; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27
@@ -2307,16 +2341,15 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a,
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT: .LBB9_3: ; %end
+; VI-NEXT: .LBB9_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB9_4:
-; VI-NEXT: s_branch .LBB9_2
;
; GFX9-LABEL: bitcast_v30i32_to_v15f64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -2334,7 +2367,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v15, v1
; GFX9-NEXT: v_mov_b32_e32 v14, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
@@ -2348,10 +2381,13 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v11, s27
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB9_4
+; GFX9-NEXT: s_cbranch_scc0 .LBB9_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB9_3
-; GFX9-NEXT: .LBB9_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB9_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB9_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_u32_e32 v29, 3, v29
; GFX9-NEXT: v_add_u32_e32 v28, 3, v28
; GFX9-NEXT: v_add_u32_e32 v27, 3, v27
@@ -2382,43 +2418,41 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a,
; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT: .LBB9_3: ; %end
+; GFX9-NEXT: .LBB9_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB9_4:
-; GFX9-NEXT: s_branch .LBB9_2
;
; GFX11-LABEL: bitcast_v30i32_to_v15f64_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB9_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB9_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB9_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB9_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB9_3:
-; GFX11-NEXT: .LBB9_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB9_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29
; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28
; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27
@@ -2449,6 +2483,7 @@ define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a,
; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB9_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -2602,6 +2637,7 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -2630,13 +2666,16 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; SI-NEXT: v_mov_b32_e32 v9, s25
; SI-NEXT: v_mov_b32_e32 v10, s26
; SI-NEXT: v_mov_b32_e32 v11, s27
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB11_4
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: s_cbranch_scc0 .LBB11_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB11_3
-; SI-NEXT: .LBB11_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB11_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB11_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
@@ -2652,17 +2691,16 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; SI-NEXT: .LBB11_3: ; %end
+; SI-NEXT: .LBB11_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: v_mov_b32_e32 v17, v31
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB11_4:
-; SI-NEXT: s_branch .LBB11_2
;
; VI-LABEL: bitcast_v15f64_to_v30i32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -2691,13 +2729,16 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; VI-NEXT: v_mov_b32_e32 v9, s25
; VI-NEXT: v_mov_b32_e32 v10, s26
; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB11_4
+; VI-NEXT: s_mov_b64 s[4:5], -1
+; VI-NEXT: s_cbranch_scc0 .LBB11_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB11_3
-; VI-NEXT: .LBB11_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB11_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB11_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
@@ -2713,17 +2754,16 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; VI-NEXT: .LBB11_3: ; %end
+; VI-NEXT: .LBB11_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: v_mov_b32_e32 v17, v31
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB11_4:
-; VI-NEXT: s_branch .LBB11_2
;
; GFX9-LABEL: bitcast_v15f64_to_v30i32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -2752,13 +2792,16 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v9, s25
; GFX9-NEXT: v_mov_b32_e32 v10, s26
; GFX9-NEXT: v_mov_b32_e32 v11, s27
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB11_4
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
+; GFX9-NEXT: s_cbranch_scc0 .LBB11_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB11_3
-; GFX9-NEXT: .LBB11_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB11_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB11_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
@@ -2774,44 +2817,42 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT: .LBB11_3: ; %end
+; GFX9-NEXT: .LBB11_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: v_mov_b32_e32 v17, v31
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB11_4:
-; GFX9-NEXT: s_branch .LBB11_2
;
; GFX11-LABEL: bitcast_v15f64_to_v30i32_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB11_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB11_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB11_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB11_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB11_3:
-; GFX11-NEXT: .LBB11_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB11_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
@@ -2827,6 +2868,7 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a,
; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB11_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -3848,6 +3890,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s45, v1
; SI-NEXT: v_readfirstlane_b32 s44, v2
; SI-NEXT: v_readfirstlane_b32 s43, v3
@@ -3863,8 +3906,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s9, v13
; SI-NEXT: v_readfirstlane_b32 s8, v14
; SI-NEXT: v_readfirstlane_b32 s7, v15
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s6, v16
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: s_cbranch_scc0 .LBB13_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_mov_b32_e32 v1, s7
@@ -4202,7 +4245,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr47
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $sgpr46
-; SI-NEXT: s_branch .LBB13_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz .LBB13_2
+; SI-NEXT: s_branch .LBB13_3
;
; VI-LABEL: bitcast_v30i32_to_v60i16_scalar:
; VI: ; %bb.0:
@@ -4215,8 +4260,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: v_writelane_b32 v30, s34, 2
; VI-NEXT: v_writelane_b32 v30, s35, 3
; VI-NEXT: v_writelane_b32 v30, s36, 4
-; VI-NEXT: v_writelane_b32 v30, s37, 5
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: v_writelane_b32 v30, s37, 5
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_writelane_b32 v30, s38, 6
; VI-NEXT: v_readfirstlane_b32 s45, v0
; VI-NEXT: v_readfirstlane_b32 s44, v1
@@ -4232,14 +4278,14 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s10, v11
; VI-NEXT: v_readfirstlane_b32 s9, v12
; VI-NEXT: v_readfirstlane_b32 s8, v13
-; VI-NEXT: v_readfirstlane_b32 s6, v14
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s7, v15
+; VI-NEXT: v_readfirstlane_b32 s7, v14
+; VI-NEXT: v_readfirstlane_b32 s6, v15
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_writelane_b32 v30, s39, 7
; VI-NEXT: s_cbranch_scc0 .LBB13_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s46, s7, 16
-; VI-NEXT: s_lshr_b32 s47, s6, 16
+; VI-NEXT: s_lshr_b32 s46, s6, 16
+; VI-NEXT: s_lshr_b32 s47, s7, 16
; VI-NEXT: s_lshr_b32 s56, s8, 16
; VI-NEXT: s_lshr_b32 s57, s9, 16
; VI-NEXT: s_lshr_b32 s58, s10, 16
@@ -4270,8 +4316,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: s_lshr_b32 s39, s16, 16
; VI-NEXT: s_cbranch_execnz .LBB13_3
; VI-NEXT: .LBB13_2: ; %cmp.true
-; VI-NEXT: s_add_i32 s7, s7, 3
; VI-NEXT: s_add_i32 s6, s6, 3
+; VI-NEXT: s_add_i32 s7, s7, 3
; VI-NEXT: s_add_i32 s8, s8, 3
; VI-NEXT: s_add_i32 s9, s9, 3
; VI-NEXT: s_add_i32 s10, s10, 3
@@ -4300,8 +4346,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: s_add_i32 s18, s18, 3
; VI-NEXT: s_add_i32 s17, s17, 3
; VI-NEXT: s_add_i32 s16, s16, 3
-; VI-NEXT: s_lshr_b32 s46, s7, 16
-; VI-NEXT: s_lshr_b32 s47, s6, 16
+; VI-NEXT: s_lshr_b32 s46, s6, 16
+; VI-NEXT: s_lshr_b32 s47, s7, 16
; VI-NEXT: s_lshr_b32 s56, s8, 16
; VI-NEXT: s_lshr_b32 s57, s9, 16
; VI-NEXT: s_lshr_b32 s58, s10, 16
@@ -4415,12 +4461,12 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: s_and_b32 s8, 0xffff, s8
; VI-NEXT: s_lshl_b32 s44, s56, 16
; VI-NEXT: s_or_b32 s8, s8, s44
-; VI-NEXT: s_and_b32 s6, 0xffff, s6
-; VI-NEXT: s_lshl_b32 s44, s47, 16
-; VI-NEXT: s_or_b32 s6, s6, s44
; VI-NEXT: s_and_b32 s7, 0xffff, s7
-; VI-NEXT: s_lshl_b32 s44, s46, 16
+; VI-NEXT: s_lshl_b32 s44, s47, 16
; VI-NEXT: s_or_b32 s7, s7, s44
+; VI-NEXT: s_and_b32 s6, 0xffff, s6
+; VI-NEXT: s_lshl_b32 s44, s46, 16
+; VI-NEXT: s_or_b32 s6, s6, s44
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s16
@@ -4449,8 +4495,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v25, s10
; VI-NEXT: v_mov_b32_e32 v26, s9
; VI-NEXT: v_mov_b32_e32 v27, s8
-; VI-NEXT: v_mov_b32_e32 v28, s6
-; VI-NEXT: v_mov_b32_e32 v29, s7
+; VI-NEXT: v_mov_b32_e32 v28, s7
+; VI-NEXT: v_mov_b32_e32 v29, s6
; VI-NEXT: v_readlane_b32 s39, v30, 7
; VI-NEXT: v_readlane_b32 s38, v30, 6
; VI-NEXT: v_readlane_b32 s37, v30, 5
@@ -4495,7 +4541,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; VI-NEXT: ; implicit-def: $sgpr56
; VI-NEXT: ; implicit-def: $sgpr47
; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: s_branch .LBB13_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB13_2
+; VI-NEXT: s_branch .LBB13_3
;
; GFX9-LABEL: bitcast_v30i32_to_v60i16_scalar:
; GFX9: ; %bb.0:
@@ -4504,45 +4552,46 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v30, s30, 0
-; GFX9-NEXT: v_writelane_b32 v30, s31, 1
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_writelane_b32 v30, s34, 2
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s7, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v2
-; GFX9-NEXT: v_readfirstlane_b32 s9, v3
-; GFX9-NEXT: v_readfirstlane_b32 s10, v4
-; GFX9-NEXT: v_readfirstlane_b32 s11, v5
-; GFX9-NEXT: v_readfirstlane_b32 s12, v6
-; GFX9-NEXT: v_readfirstlane_b32 s13, v7
-; GFX9-NEXT: v_readfirstlane_b32 s14, v8
-; GFX9-NEXT: v_readfirstlane_b32 s15, v9
-; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_readfirstlane_b32 s41, v11
-; GFX9-NEXT: v_readfirstlane_b32 s42, v12
-; GFX9-NEXT: v_readfirstlane_b32 s43, v13
-; GFX9-NEXT: v_readfirstlane_b32 s44, v14
+; GFX9-NEXT: v_writelane_b32 v30, s31, 1
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: v_readfirstlane_b32 s45, v15
+; GFX9-NEXT: v_writelane_b32 v30, s34, 2
+; GFX9-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s9, v2
+; GFX9-NEXT: v_readfirstlane_b32 s10, v3
+; GFX9-NEXT: v_readfirstlane_b32 s11, v4
+; GFX9-NEXT: v_readfirstlane_b32 s12, v5
+; GFX9-NEXT: v_readfirstlane_b32 s13, v6
+; GFX9-NEXT: v_readfirstlane_b32 s14, v7
+; GFX9-NEXT: v_readfirstlane_b32 s15, v8
+; GFX9-NEXT: v_readfirstlane_b32 s40, v9
+; GFX9-NEXT: v_readfirstlane_b32 s41, v10
+; GFX9-NEXT: v_readfirstlane_b32 s42, v11
+; GFX9-NEXT: v_readfirstlane_b32 s43, v12
+; GFX9-NEXT: v_readfirstlane_b32 s44, v13
+; GFX9-NEXT: v_readfirstlane_b32 s45, v14
+; GFX9-NEXT: v_readfirstlane_b32 s6, v15
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_writelane_b32 v30, s35, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB13_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_lshr_b32 s46, s45, 16
-; GFX9-NEXT: s_lshr_b32 s47, s44, 16
-; GFX9-NEXT: s_lshr_b32 s56, s43, 16
-; GFX9-NEXT: s_lshr_b32 s57, s42, 16
-; GFX9-NEXT: s_lshr_b32 s58, s41, 16
-; GFX9-NEXT: s_lshr_b32 s59, s40, 16
-; GFX9-NEXT: s_lshr_b32 s60, s15, 16
-; GFX9-NEXT: s_lshr_b32 s61, s14, 16
-; GFX9-NEXT: s_lshr_b32 s62, s13, 16
-; GFX9-NEXT: s_lshr_b32 s63, s12, 16
-; GFX9-NEXT: s_lshr_b32 s72, s11, 16
-; GFX9-NEXT: s_lshr_b32 s73, s10, 16
-; GFX9-NEXT: s_lshr_b32 s74, s9, 16
-; GFX9-NEXT: s_lshr_b32 s75, s8, 16
-; GFX9-NEXT: s_lshr_b32 s76, s7, 16
-; GFX9-NEXT: s_lshr_b32 s77, s6, 16
+; GFX9-NEXT: s_lshr_b32 s46, s6, 16
+; GFX9-NEXT: s_lshr_b32 s47, s45, 16
+; GFX9-NEXT: s_lshr_b32 s56, s44, 16
+; GFX9-NEXT: s_lshr_b32 s57, s43, 16
+; GFX9-NEXT: s_lshr_b32 s58, s42, 16
+; GFX9-NEXT: s_lshr_b32 s59, s41, 16
+; GFX9-NEXT: s_lshr_b32 s60, s40, 16
+; GFX9-NEXT: s_lshr_b32 s61, s15, 16
+; GFX9-NEXT: s_lshr_b32 s62, s14, 16
+; GFX9-NEXT: s_lshr_b32 s63, s13, 16
+; GFX9-NEXT: s_lshr_b32 s72, s12, 16
+; GFX9-NEXT: s_lshr_b32 s73, s11, 16
+; GFX9-NEXT: s_lshr_b32 s74, s10, 16
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s8, 16
+; GFX9-NEXT: s_lshr_b32 s77, s7, 16
; GFX9-NEXT: s_lshr_b32 s78, s29, 16
; GFX9-NEXT: s_lshr_b32 s79, s28, 16
; GFX9-NEXT: s_lshr_b32 s88, s27, 16
@@ -4559,6 +4608,7 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s35, s16, 16
; GFX9-NEXT: s_cbranch_execnz .LBB13_3
; GFX9-NEXT: .LBB13_2: ; %cmp.true
+; GFX9-NEXT: s_add_i32 s6, s6, 3
; GFX9-NEXT: s_add_i32 s45, s45, 3
; GFX9-NEXT: s_add_i32 s44, s44, 3
; GFX9-NEXT: s_add_i32 s43, s43, 3
@@ -4574,7 +4624,6 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: s_add_i32 s9, s9, 3
; GFX9-NEXT: s_add_i32 s8, s8, 3
; GFX9-NEXT: s_add_i32 s7, s7, 3
-; GFX9-NEXT: s_add_i32 s6, s6, 3
; GFX9-NEXT: s_add_i32 s29, s29, 3
; GFX9-NEXT: s_add_i32 s28, s28, 3
; GFX9-NEXT: s_add_i32 s27, s27, 3
@@ -4589,22 +4638,22 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: s_add_i32 s18, s18, 3
; GFX9-NEXT: s_add_i32 s17, s17, 3
; GFX9-NEXT: s_add_i32 s16, s16, 3
-; GFX9-NEXT: s_lshr_b32 s46, s45, 16
-; GFX9-NEXT: s_lshr_b32 s47, s44, 16
-; GFX9-NEXT: s_lshr_b32 s56, s43, 16
-; GFX9-NEXT: s_lshr_b32 s57, s42, 16
-; GFX9-NEXT: s_lshr_b32 s58, s41, 16
-; GFX9-NEXT: s_lshr_b32 s59, s40, 16
-; GFX9-NEXT: s_lshr_b32 s60, s15, 16
-; GFX9-NEXT: s_lshr_b32 s61, s14, 16
-; GFX9-NEXT: s_lshr_b32 s62, s13, 16
-; GFX9-NEXT: s_lshr_b32 s63, s12, 16
-; GFX9-NEXT: s_lshr_b32 s72, s11, 16
-; GFX9-NEXT: s_lshr_b32 s73, s10, 16
-; GFX9-NEXT: s_lshr_b32 s74, s9, 16
-; GFX9-NEXT: s_lshr_b32 s75, s8, 16
-; GFX9-NEXT: s_lshr_b32 s76, s7, 16
-; GFX9-NEXT: s_lshr_b32 s77, s6, 16
+; GFX9-NEXT: s_lshr_b32 s46, s6, 16
+; GFX9-NEXT: s_lshr_b32 s47, s45, 16
+; GFX9-NEXT: s_lshr_b32 s56, s44, 16
+; GFX9-NEXT: s_lshr_b32 s57, s43, 16
+; GFX9-NEXT: s_lshr_b32 s58, s42, 16
+; GFX9-NEXT: s_lshr_b32 s59, s41, 16
+; GFX9-NEXT: s_lshr_b32 s60, s40, 16
+; GFX9-NEXT: s_lshr_b32 s61, s15, 16
+; GFX9-NEXT: s_lshr_b32 s62, s14, 16
+; GFX9-NEXT: s_lshr_b32 s63, s13, 16
+; GFX9-NEXT: s_lshr_b32 s72, s12, 16
+; GFX9-NEXT: s_lshr_b32 s73, s11, 16
+; GFX9-NEXT: s_lshr_b32 s74, s10, 16
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s8, 16
+; GFX9-NEXT: s_lshr_b32 s77, s7, 16
; GFX9-NEXT: s_lshr_b32 s78, s29, 16
; GFX9-NEXT: s_lshr_b32 s79, s28, 16
; GFX9-NEXT: s_lshr_b32 s88, s27, 16
@@ -4634,22 +4683,22 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88
; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79
; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77
-; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76
-; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72
-; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63
-; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62
-; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61
-; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60
-; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59
-; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58
-; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57
-; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56
-; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47
-; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46
+; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s77
+; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s76
+; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s75
+; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s74
+; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s73
+; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s72
+; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s63
+; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s62
+; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s61
+; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s60
+; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s59
+; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s58
+; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s57
+; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s56
+; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s47
+; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s46
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s16
@@ -4664,22 +4713,22 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v11, s25
; GFX9-NEXT: v_mov_b32_e32 v12, s26
; GFX9-NEXT: v_mov_b32_e32 v13, s27
-; GFX9-NEXT: v_mov_b32_e32 v14, s6
-; GFX9-NEXT: v_mov_b32_e32 v15, s7
-; GFX9-NEXT: v_mov_b32_e32 v16, s8
-; GFX9-NEXT: v_mov_b32_e32 v17, s9
-; GFX9-NEXT: v_mov_b32_e32 v18, s10
-; GFX9-NEXT: v_mov_b32_e32 v19, s11
-; GFX9-NEXT: v_mov_b32_e32 v20, s12
-; GFX9-NEXT: v_mov_b32_e32 v21, s13
-; GFX9-NEXT: v_mov_b32_e32 v22, s14
-; GFX9-NEXT: v_mov_b32_e32 v23, s15
-; GFX9-NEXT: v_mov_b32_e32 v24, s28
-; GFX9-NEXT: v_mov_b32_e32 v25, s29
-; GFX9-NEXT: v_mov_b32_e32 v26, s40
-; GFX9-NEXT: v_mov_b32_e32 v27, s41
-; GFX9-NEXT: v_mov_b32_e32 v28, s42
-; GFX9-NEXT: v_mov_b32_e32 v29, s43
+; GFX9-NEXT: v_mov_b32_e32 v14, s7
+; GFX9-NEXT: v_mov_b32_e32 v15, s8
+; GFX9-NEXT: v_mov_b32_e32 v16, s9
+; GFX9-NEXT: v_mov_b32_e32 v17, s10
+; GFX9-NEXT: v_mov_b32_e32 v18, s11
+; GFX9-NEXT: v_mov_b32_e32 v19, s12
+; GFX9-NEXT: v_mov_b32_e32 v20, s13
+; GFX9-NEXT: v_mov_b32_e32 v21, s14
+; GFX9-NEXT: v_mov_b32_e32 v22, s15
+; GFX9-NEXT: v_mov_b32_e32 v23, s28
+; GFX9-NEXT: v_mov_b32_e32 v24, s29
+; GFX9-NEXT: v_mov_b32_e32 v25, s40
+; GFX9-NEXT: v_mov_b32_e32 v26, s41
+; GFX9-NEXT: v_mov_b32_e32 v27, s42
+; GFX9-NEXT: v_mov_b32_e32 v28, s43
+; GFX9-NEXT: v_mov_b32_e32 v29, s6
; GFX9-NEXT: v_readlane_b32 s35, v30, 3
; GFX9-NEXT: v_readlane_b32 s34, v30, 2
; GFX9-NEXT: v_readlane_b32 s31, v30, 1
@@ -4720,7 +4769,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX9-NEXT: ; implicit-def: $sgpr56
; GFX9-NEXT: ; implicit-def: $sgpr47
; GFX9-NEXT: ; implicit-def: $sgpr46
-; GFX9-NEXT: s_branch .LBB13_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB13_2
+; GFX9-NEXT: s_branch .LBB13_3
;
; GFX11-LABEL: bitcast_v30i32_to_v60i16_scalar:
; GFX11: ; %bb.0:
@@ -4735,16 +4786,16 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX11-NEXT: v_readfirstlane_b32 s10, v6
; GFX11-NEXT: v_readfirstlane_b32 s11, v7
; GFX11-NEXT: v_readfirstlane_b32 s12, v8
-; GFX11-NEXT: v_readfirstlane_b32 s13, v9
+; GFX11-NEXT: v_readfirstlane_b32 s14, v9
; GFX11-NEXT: v_readfirstlane_b32 s15, v10
-; GFX11-NEXT: v_readfirstlane_b32 s14, v11
-; GFX11-NEXT: s_mov_b32 s94, 0
+; GFX11-NEXT: v_readfirstlane_b32 s13, v11
+; GFX11-NEXT: s_mov_b32 s94, -1
; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo
; GFX11-NEXT: s_cbranch_scc0 .LBB13_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s40, s14, 16
+; GFX11-NEXT: s_lshr_b32 s40, s13, 16
; GFX11-NEXT: s_lshr_b32 s41, s15, 16
-; GFX11-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-NEXT: s_lshr_b32 s42, s14, 16
; GFX11-NEXT: s_lshr_b32 s43, s12, 16
; GFX11-NEXT: s_lshr_b32 s44, s11, 16
; GFX11-NEXT: s_lshr_b32 s45, s10, 16
@@ -4772,12 +4823,11 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX11-NEXT: s_lshr_b32 s91, s2, 16
; GFX11-NEXT: s_lshr_b32 s92, s1, 16
; GFX11-NEXT: s_lshr_b32 s93, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
-; GFX11-NEXT: s_cbranch_vccnz .LBB13_3
+; GFX11-NEXT: s_cbranch_execnz .LBB13_3
; GFX11-NEXT: .LBB13_2: ; %cmp.true
-; GFX11-NEXT: s_add_i32 s14, s14, 3
-; GFX11-NEXT: s_add_i32 s15, s15, 3
; GFX11-NEXT: s_add_i32 s13, s13, 3
+; GFX11-NEXT: s_add_i32 s15, s15, 3
+; GFX11-NEXT: s_add_i32 s14, s14, 3
; GFX11-NEXT: s_add_i32 s12, s12, 3
; GFX11-NEXT: s_add_i32 s11, s11, 3
; GFX11-NEXT: s_add_i32 s10, s10, 3
@@ -4805,9 +4855,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX11-NEXT: s_add_i32 s2, s2, 3
; GFX11-NEXT: s_add_i32 s1, s1, 3
; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_lshr_b32 s40, s14, 16
+; GFX11-NEXT: s_lshr_b32 s40, s13, 16
; GFX11-NEXT: s_lshr_b32 s41, s15, 16
-; GFX11-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-NEXT: s_lshr_b32 s42, s14, 16
; GFX11-NEXT: s_lshr_b32 s43, s12, 16
; GFX11-NEXT: s_lshr_b32 s44, s11, 16
; GFX11-NEXT: s_lshr_b32 s45, s10, 16
@@ -4864,9 +4914,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45
; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44
; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43
-; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42
+; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s42
; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41
-; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40
+; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
@@ -4880,8 +4930,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7
; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9
; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11
-; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13
-; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
+; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s14
+; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s13
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT: .LBB13_4:
; GFX11-NEXT: ; implicit-def: $sgpr93
@@ -4914,7 +4964,9 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr41
; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: s_branch .LBB13_2
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
+; GFX11-NEXT: s_cbranch_vccz .LBB13_2
+; GFX11-NEXT: s_branch .LBB13_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -6332,6 +6384,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v35, v22
; SI-NEXT: v_mov_b32_e32 v36, v20
; SI-NEXT: v_mov_b32_e32 v37, v18
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5
@@ -6363,7 +6416,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_and_b64 s[6:7], vcc, exec
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10
@@ -6666,7 +6719,9 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v30, v32
-; SI-NEXT: s_branch .LBB15_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz .LBB15_2
+; SI-NEXT: s_branch .LBB15_3
;
; VI-LABEL: bitcast_v60i16_to_v30i32_scalar:
; VI: ; %bb.0:
@@ -6686,6 +6741,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; VI-NEXT: s_lshr_b32 s42, s17, 16
; VI-NEXT: s_lshr_b32 s43, s16, 16
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v32, v15
; VI-NEXT: v_mov_b32_e32 v33, v14
; VI-NEXT: v_mov_b32_e32 v34, v13
@@ -6702,7 +6758,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v53, v2
; VI-NEXT: v_mov_b32_e32 v54, v1
; VI-NEXT: v_mov_b32_e32 v55, v0
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: s_cbranch_scc0 .LBB15_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_mov_b32_e32 v0, 16
@@ -6949,11 +7005,28 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB15_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; VI-NEXT: s_branch .LBB15_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB15_2
+; VI-NEXT: s_branch .LBB15_3
;
; GFX9-LABEL: bitcast_v60i16_to_v30i32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_lshr_b32 s40, s29, 16
+; GFX9-NEXT: s_lshr_b32 s41, s28, 16
+; GFX9-NEXT: s_lshr_b32 s42, s27, 16
+; GFX9-NEXT: s_lshr_b32 s43, s26, 16
+; GFX9-NEXT: s_lshr_b32 s15, s25, 16
+; GFX9-NEXT: s_lshr_b32 s14, s24, 16
+; GFX9-NEXT: s_lshr_b32 s13, s23, 16
+; GFX9-NEXT: s_lshr_b32 s12, s22, 16
+; GFX9-NEXT: s_lshr_b32 s11, s21, 16
+; GFX9-NEXT: s_lshr_b32 s10, s20, 16
+; GFX9-NEXT: s_lshr_b32 s9, s19, 16
+; GFX9-NEXT: s_lshr_b32 s8, s18, 16
+; GFX9-NEXT: s_lshr_b32 s7, s17, 16
+; GFX9-NEXT: s_lshr_b32 s6, s16, 16
; GFX9-NEXT: v_mov_b32_e32 v32, v15
; GFX9-NEXT: v_mov_b32_e32 v33, v14
; GFX9-NEXT: v_mov_b32_e32 v34, v13
@@ -6970,21 +7043,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v53, v2
; GFX9-NEXT: v_mov_b32_e32 v54, v1
; GFX9-NEXT: v_mov_b32_e32 v55, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_lshr_b32 s40, s29, 16
-; GFX9-NEXT: s_lshr_b32 s41, s28, 16
-; GFX9-NEXT: s_lshr_b32 s42, s27, 16
-; GFX9-NEXT: s_lshr_b32 s43, s26, 16
-; GFX9-NEXT: s_lshr_b32 s15, s25, 16
-; GFX9-NEXT: s_lshr_b32 s14, s24, 16
-; GFX9-NEXT: s_lshr_b32 s13, s23, 16
-; GFX9-NEXT: s_lshr_b32 s12, s22, 16
-; GFX9-NEXT: s_lshr_b32 s11, s21, 16
-; GFX9-NEXT: s_lshr_b32 s10, s20, 16
-; GFX9-NEXT: s_lshr_b32 s9, s19, 16
-; GFX9-NEXT: s_lshr_b32 s8, s18, 16
-; GFX9-NEXT: s_lshr_b32 s7, s17, 16
-; GFX9-NEXT: s_lshr_b32 s6, s16, 16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -7005,7 +7064,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33
; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7
; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8
@@ -7020,6 +7078,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42
; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37
; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38
@@ -7165,7 +7224,9 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: .LBB15_4:
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX9-NEXT: s_branch .LBB15_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB15_2
+; GFX9-NEXT: s_branch .LBB15_3
;
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -7210,41 +7271,41 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1
+; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -7259,17 +7320,16 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -7283,24 +7343,24 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -7317,7 +7377,9 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB15_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB15_2
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB15_2
+; GFX11-TRUE16-NEXT: s_branch .LBB15_3
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32_scalar:
; GFX11-FAKE16: ; %bb.0:
@@ -7350,41 +7412,41 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -7399,17 +7461,16 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3
; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -7423,24 +7484,24 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -7457,7 +7518,9 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB15_4:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB15_2
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB15_2
+; GFX11-FAKE16-NEXT: s_branch .LBB15_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8835,6 +8898,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s45, v1
; SI-NEXT: v_readfirstlane_b32 s44, v2
; SI-NEXT: v_readfirstlane_b32 s43, v3
@@ -8847,11 +8911,11 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_readfirstlane_b32 s11, v11
; SI-NEXT: v_readfirstlane_b32 s10, v12
-; SI-NEXT: v_readfirstlane_b32 s8, v13
-; SI-NEXT: v_readfirstlane_b32 s7, v14
-; SI-NEXT: v_readfirstlane_b32 s6, v15
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s9, v16
+; SI-NEXT: v_readfirstlane_b32 s9, v13
+; SI-NEXT: v_readfirstlane_b32 s8, v14
+; SI-NEXT: v_readfirstlane_b32 s7, v15
+; SI-NEXT: v_readfirstlane_b32 s6, v16
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
@@ -8867,13 +8931,13 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB17_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_lshr_b32 s4, s9, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
; SI-NEXT: s_lshr_b32 s4, s6, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
; SI-NEXT: s_lshr_b32 s4, s7, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
; SI-NEXT: s_lshr_b32 s4, s8, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT: s_lshr_b32 s4, s9, 16
; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
; SI-NEXT: s_lshr_b32 s4, s10, 16
; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
@@ -8931,10 +8995,10 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: s_lshr_b32 s4, s16, 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v60, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s9
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v8, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v10, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v8, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v10, s9
; SI-NEXT: v_cvt_f32_f16_e32 v12, s10
; SI-NEXT: v_cvt_f32_f16_e32 v14, s11
; SI-NEXT: v_cvt_f32_f16_e32 v16, s12
@@ -8989,10 +9053,10 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: s_add_i32 s12, s12, 3
; SI-NEXT: s_add_i32 s11, s11, 3
; SI-NEXT: s_add_i32 s10, s10, 3
+; SI-NEXT: s_add_i32 s9, s9, 3
; SI-NEXT: s_add_i32 s8, s8, 3
; SI-NEXT: s_add_i32 s7, s7, 3
; SI-NEXT: s_add_i32 s6, s6, 3
-; SI-NEXT: s_add_i32 s9, s9, 3
; SI-NEXT: s_lshr_b32 s4, s16, 16
; SI-NEXT: s_lshr_b32 s5, s17, 16
; SI-NEXT: s_lshr_b32 s46, s18, 16
@@ -9019,14 +9083,14 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: s_lshr_b32 s91, s12, 16
; SI-NEXT: s_lshr_b32 s92, s11, 16
; SI-NEXT: s_lshr_b32 s93, s10, 16
-; SI-NEXT: s_lshr_b32 s94, s8, 16
-; SI-NEXT: s_lshr_b32 s95, s7, 16
-; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16
-; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s9
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v8, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v10, s8
+; SI-NEXT: s_lshr_b32 s94, s9, 16
+; SI-NEXT: s_lshr_b32 s95, s8, 16
+; SI-NEXT: s_lshr_b32 vcc_lo, s7, 16
+; SI-NEXT: s_lshr_b32 vcc_hi, s6, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v8, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v10, s9
; SI-NEXT: v_cvt_f32_f16_e32 v12, s10
; SI-NEXT: v_cvt_f32_f16_e32 v14, s11
; SI-NEXT: v_cvt_f32_f16_e32 v16, s12
@@ -9373,7 +9437,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: s_branch .LBB17_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz .LBB17_2
+; SI-NEXT: s_branch .LBB17_3
;
; VI-LABEL: bitcast_v30i32_to_v60f16_scalar:
; VI: ; %bb.0:
@@ -9386,8 +9452,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: v_writelane_b32 v30, s34, 2
; VI-NEXT: v_writelane_b32 v30, s35, 3
; VI-NEXT: v_writelane_b32 v30, s36, 4
-; VI-NEXT: v_writelane_b32 v30, s37, 5
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: v_writelane_b32 v30, s37, 5
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_writelane_b32 v30, s38, 6
; VI-NEXT: v_readfirstlane_b32 s45, v0
; VI-NEXT: v_readfirstlane_b32 s44, v1
@@ -9403,14 +9470,14 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s10, v11
; VI-NEXT: v_readfirstlane_b32 s9, v12
; VI-NEXT: v_readfirstlane_b32 s8, v13
-; VI-NEXT: v_readfirstlane_b32 s6, v14
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s7, v15
+; VI-NEXT: v_readfirstlane_b32 s7, v14
+; VI-NEXT: v_readfirstlane_b32 s6, v15
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_writelane_b32 v30, s39, 7
; VI-NEXT: s_cbranch_scc0 .LBB17_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s46, s7, 16
-; VI-NEXT: s_lshr_b32 s47, s6, 16
+; VI-NEXT: s_lshr_b32 s46, s6, 16
+; VI-NEXT: s_lshr_b32 s47, s7, 16
; VI-NEXT: s_lshr_b32 s56, s8, 16
; VI-NEXT: s_lshr_b32 s57, s9, 16
; VI-NEXT: s_lshr_b32 s58, s10, 16
@@ -9441,8 +9508,8 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: s_lshr_b32 s39, s16, 16
; VI-NEXT: s_cbranch_execnz .LBB17_3
; VI-NEXT: .LBB17_2: ; %cmp.true
-; VI-NEXT: s_add_i32 s7, s7, 3
; VI-NEXT: s_add_i32 s6, s6, 3
+; VI-NEXT: s_add_i32 s7, s7, 3
; VI-NEXT: s_add_i32 s8, s8, 3
; VI-NEXT: s_add_i32 s9, s9, 3
; VI-NEXT: s_add_i32 s10, s10, 3
@@ -9471,8 +9538,8 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: s_add_i32 s18, s18, 3
; VI-NEXT: s_add_i32 s17, s17, 3
; VI-NEXT: s_add_i32 s16, s16, 3
-; VI-NEXT: s_lshr_b32 s46, s7, 16
-; VI-NEXT: s_lshr_b32 s47, s6, 16
+; VI-NEXT: s_lshr_b32 s46, s6, 16
+; VI-NEXT: s_lshr_b32 s47, s7, 16
; VI-NEXT: s_lshr_b32 s56, s8, 16
; VI-NEXT: s_lshr_b32 s57, s9, 16
; VI-NEXT: s_lshr_b32 s58, s10, 16
@@ -9586,12 +9653,12 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: s_and_b32 s8, 0xffff, s8
; VI-NEXT: s_lshl_b32 s44, s56, 16
; VI-NEXT: s_or_b32 s8, s8, s44
-; VI-NEXT: s_and_b32 s6, 0xffff, s6
-; VI-NEXT: s_lshl_b32 s44, s47, 16
-; VI-NEXT: s_or_b32 s6, s6, s44
; VI-NEXT: s_and_b32 s7, 0xffff, s7
-; VI-NEXT: s_lshl_b32 s44, s46, 16
+; VI-NEXT: s_lshl_b32 s44, s47, 16
; VI-NEXT: s_or_b32 s7, s7, s44
+; VI-NEXT: s_and_b32 s6, 0xffff, s6
+; VI-NEXT: s_lshl_b32 s44, s46, 16
+; VI-NEXT: s_or_b32 s6, s6, s44
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s16
@@ -9620,8 +9687,8 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v25, s10
; VI-NEXT: v_mov_b32_e32 v26, s9
; VI-NEXT: v_mov_b32_e32 v27, s8
-; VI-NEXT: v_mov_b32_e32 v28, s6
-; VI-NEXT: v_mov_b32_e32 v29, s7
+; VI-NEXT: v_mov_b32_e32 v28, s7
+; VI-NEXT: v_mov_b32_e32 v29, s6
; VI-NEXT: v_readlane_b32 s39, v30, 7
; VI-NEXT: v_readlane_b32 s38, v30, 6
; VI-NEXT: v_readlane_b32 s37, v30, 5
@@ -9666,7 +9733,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; VI-NEXT: ; implicit-def: $sgpr56
; VI-NEXT: ; implicit-def: $sgpr47
; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: s_branch .LBB17_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB17_2
+; VI-NEXT: s_branch .LBB17_3
;
; GFX9-LABEL: bitcast_v30i32_to_v60f16_scalar:
; GFX9: ; %bb.0:
@@ -9675,45 +9744,46 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v30, s30, 0
-; GFX9-NEXT: v_writelane_b32 v30, s31, 1
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_writelane_b32 v30, s34, 2
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s7, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v2
-; GFX9-NEXT: v_readfirstlane_b32 s9, v3
-; GFX9-NEXT: v_readfirstlane_b32 s10, v4
-; GFX9-NEXT: v_readfirstlane_b32 s11, v5
-; GFX9-NEXT: v_readfirstlane_b32 s12, v6
-; GFX9-NEXT: v_readfirstlane_b32 s13, v7
-; GFX9-NEXT: v_readfirstlane_b32 s14, v8
-; GFX9-NEXT: v_readfirstlane_b32 s15, v9
-; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_readfirstlane_b32 s41, v11
-; GFX9-NEXT: v_readfirstlane_b32 s42, v12
-; GFX9-NEXT: v_readfirstlane_b32 s43, v13
-; GFX9-NEXT: v_readfirstlane_b32 s44, v14
+; GFX9-NEXT: v_writelane_b32 v30, s31, 1
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: v_readfirstlane_b32 s45, v15
+; GFX9-NEXT: v_writelane_b32 v30, s34, 2
+; GFX9-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s9, v2
+; GFX9-NEXT: v_readfirstlane_b32 s10, v3
+; GFX9-NEXT: v_readfirstlane_b32 s11, v4
+; GFX9-NEXT: v_readfirstlane_b32 s12, v5
+; GFX9-NEXT: v_readfirstlane_b32 s13, v6
+; GFX9-NEXT: v_readfirstlane_b32 s14, v7
+; GFX9-NEXT: v_readfirstlane_b32 s15, v8
+; GFX9-NEXT: v_readfirstlane_b32 s40, v9
+; GFX9-NEXT: v_readfirstlane_b32 s41, v10
+; GFX9-NEXT: v_readfirstlane_b32 s42, v11
+; GFX9-NEXT: v_readfirstlane_b32 s43, v12
+; GFX9-NEXT: v_readfirstlane_b32 s44, v13
+; GFX9-NEXT: v_readfirstlane_b32 s45, v14
+; GFX9-NEXT: v_readfirstlane_b32 s6, v15
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_writelane_b32 v30, s35, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB17_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_lshr_b32 s46, s45, 16
-; GFX9-NEXT: s_lshr_b32 s47, s44, 16
-; GFX9-NEXT: s_lshr_b32 s56, s43, 16
-; GFX9-NEXT: s_lshr_b32 s57, s42, 16
-; GFX9-NEXT: s_lshr_b32 s58, s41, 16
-; GFX9-NEXT: s_lshr_b32 s59, s40, 16
-; GFX9-NEXT: s_lshr_b32 s60, s15, 16
-; GFX9-NEXT: s_lshr_b32 s61, s14, 16
-; GFX9-NEXT: s_lshr_b32 s62, s13, 16
-; GFX9-NEXT: s_lshr_b32 s63, s12, 16
-; GFX9-NEXT: s_lshr_b32 s72, s11, 16
-; GFX9-NEXT: s_lshr_b32 s73, s10, 16
-; GFX9-NEXT: s_lshr_b32 s74, s9, 16
-; GFX9-NEXT: s_lshr_b32 s75, s8, 16
-; GFX9-NEXT: s_lshr_b32 s76, s7, 16
-; GFX9-NEXT: s_lshr_b32 s77, s6, 16
+; GFX9-NEXT: s_lshr_b32 s46, s6, 16
+; GFX9-NEXT: s_lshr_b32 s47, s45, 16
+; GFX9-NEXT: s_lshr_b32 s56, s44, 16
+; GFX9-NEXT: s_lshr_b32 s57, s43, 16
+; GFX9-NEXT: s_lshr_b32 s58, s42, 16
+; GFX9-NEXT: s_lshr_b32 s59, s41, 16
+; GFX9-NEXT: s_lshr_b32 s60, s40, 16
+; GFX9-NEXT: s_lshr_b32 s61, s15, 16
+; GFX9-NEXT: s_lshr_b32 s62, s14, 16
+; GFX9-NEXT: s_lshr_b32 s63, s13, 16
+; GFX9-NEXT: s_lshr_b32 s72, s12, 16
+; GFX9-NEXT: s_lshr_b32 s73, s11, 16
+; GFX9-NEXT: s_lshr_b32 s74, s10, 16
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s8, 16
+; GFX9-NEXT: s_lshr_b32 s77, s7, 16
; GFX9-NEXT: s_lshr_b32 s78, s29, 16
; GFX9-NEXT: s_lshr_b32 s79, s28, 16
; GFX9-NEXT: s_lshr_b32 s88, s27, 16
@@ -9730,6 +9800,7 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s35, s16, 16
; GFX9-NEXT: s_cbranch_execnz .LBB17_3
; GFX9-NEXT: .LBB17_2: ; %cmp.true
+; GFX9-NEXT: s_add_i32 s6, s6, 3
; GFX9-NEXT: s_add_i32 s45, s45, 3
; GFX9-NEXT: s_add_i32 s44, s44, 3
; GFX9-NEXT: s_add_i32 s43, s43, 3
@@ -9745,7 +9816,6 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: s_add_i32 s9, s9, 3
; GFX9-NEXT: s_add_i32 s8, s8, 3
; GFX9-NEXT: s_add_i32 s7, s7, 3
-; GFX9-NEXT: s_add_i32 s6, s6, 3
; GFX9-NEXT: s_add_i32 s29, s29, 3
; GFX9-NEXT: s_add_i32 s28, s28, 3
; GFX9-NEXT: s_add_i32 s27, s27, 3
@@ -9760,22 +9830,22 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: s_add_i32 s18, s18, 3
; GFX9-NEXT: s_add_i32 s17, s17, 3
; GFX9-NEXT: s_add_i32 s16, s16, 3
-; GFX9-NEXT: s_lshr_b32 s46, s45, 16
-; GFX9-NEXT: s_lshr_b32 s47, s44, 16
-; GFX9-NEXT: s_lshr_b32 s56, s43, 16
-; GFX9-NEXT: s_lshr_b32 s57, s42, 16
-; GFX9-NEXT: s_lshr_b32 s58, s41, 16
-; GFX9-NEXT: s_lshr_b32 s59, s40, 16
-; GFX9-NEXT: s_lshr_b32 s60, s15, 16
-; GFX9-NEXT: s_lshr_b32 s61, s14, 16
-; GFX9-NEXT: s_lshr_b32 s62, s13, 16
-; GFX9-NEXT: s_lshr_b32 s63, s12, 16
-; GFX9-NEXT: s_lshr_b32 s72, s11, 16
-; GFX9-NEXT: s_lshr_b32 s73, s10, 16
-; GFX9-NEXT: s_lshr_b32 s74, s9, 16
-; GFX9-NEXT: s_lshr_b32 s75, s8, 16
-; GFX9-NEXT: s_lshr_b32 s76, s7, 16
-; GFX9-NEXT: s_lshr_b32 s77, s6, 16
+; GFX9-NEXT: s_lshr_b32 s46, s6, 16
+; GFX9-NEXT: s_lshr_b32 s47, s45, 16
+; GFX9-NEXT: s_lshr_b32 s56, s44, 16
+; GFX9-NEXT: s_lshr_b32 s57, s43, 16
+; GFX9-NEXT: s_lshr_b32 s58, s42, 16
+; GFX9-NEXT: s_lshr_b32 s59, s41, 16
+; GFX9-NEXT: s_lshr_b32 s60, s40, 16
+; GFX9-NEXT: s_lshr_b32 s61, s15, 16
+; GFX9-NEXT: s_lshr_b32 s62, s14, 16
+; GFX9-NEXT: s_lshr_b32 s63, s13, 16
+; GFX9-NEXT: s_lshr_b32 s72, s12, 16
+; GFX9-NEXT: s_lshr_b32 s73, s11, 16
+; GFX9-NEXT: s_lshr_b32 s74, s10, 16
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s8, 16
+; GFX9-NEXT: s_lshr_b32 s77, s7, 16
; GFX9-NEXT: s_lshr_b32 s78, s29, 16
; GFX9-NEXT: s_lshr_b32 s79, s28, 16
; GFX9-NEXT: s_lshr_b32 s88, s27, 16
@@ -9805,22 +9875,22 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88
; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79
; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77
-; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76
-; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72
-; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63
-; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62
-; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61
-; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60
-; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59
-; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58
-; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57
-; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56
-; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47
-; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46
+; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s77
+; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s76
+; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s75
+; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s74
+; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s73
+; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s72
+; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s63
+; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s62
+; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s61
+; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s60
+; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s59
+; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s58
+; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s57
+; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s56
+; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s47
+; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s46
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s16
@@ -9835,22 +9905,22 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v11, s25
; GFX9-NEXT: v_mov_b32_e32 v12, s26
; GFX9-NEXT: v_mov_b32_e32 v13, s27
-; GFX9-NEXT: v_mov_b32_e32 v14, s6
-; GFX9-NEXT: v_mov_b32_e32 v15, s7
-; GFX9-NEXT: v_mov_b32_e32 v16, s8
-; GFX9-NEXT: v_mov_b32_e32 v17, s9
-; GFX9-NEXT: v_mov_b32_e32 v18, s10
-; GFX9-NEXT: v_mov_b32_e32 v19, s11
-; GFX9-NEXT: v_mov_b32_e32 v20, s12
-; GFX9-NEXT: v_mov_b32_e32 v21, s13
-; GFX9-NEXT: v_mov_b32_e32 v22, s14
-; GFX9-NEXT: v_mov_b32_e32 v23, s15
-; GFX9-NEXT: v_mov_b32_e32 v24, s28
-; GFX9-NEXT: v_mov_b32_e32 v25, s29
-; GFX9-NEXT: v_mov_b32_e32 v26, s40
-; GFX9-NEXT: v_mov_b32_e32 v27, s41
-; GFX9-NEXT: v_mov_b32_e32 v28, s42
-; GFX9-NEXT: v_mov_b32_e32 v29, s43
+; GFX9-NEXT: v_mov_b32_e32 v14, s7
+; GFX9-NEXT: v_mov_b32_e32 v15, s8
+; GFX9-NEXT: v_mov_b32_e32 v16, s9
+; GFX9-NEXT: v_mov_b32_e32 v17, s10
+; GFX9-NEXT: v_mov_b32_e32 v18, s11
+; GFX9-NEXT: v_mov_b32_e32 v19, s12
+; GFX9-NEXT: v_mov_b32_e32 v20, s13
+; GFX9-NEXT: v_mov_b32_e32 v21, s14
+; GFX9-NEXT: v_mov_b32_e32 v22, s15
+; GFX9-NEXT: v_mov_b32_e32 v23, s28
+; GFX9-NEXT: v_mov_b32_e32 v24, s29
+; GFX9-NEXT: v_mov_b32_e32 v25, s40
+; GFX9-NEXT: v_mov_b32_e32 v26, s41
+; GFX9-NEXT: v_mov_b32_e32 v27, s42
+; GFX9-NEXT: v_mov_b32_e32 v28, s43
+; GFX9-NEXT: v_mov_b32_e32 v29, s6
; GFX9-NEXT: v_readlane_b32 s35, v30, 3
; GFX9-NEXT: v_readlane_b32 s34, v30, 2
; GFX9-NEXT: v_readlane_b32 s31, v30, 1
@@ -9891,7 +9961,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX9-NEXT: ; implicit-def: $sgpr56
; GFX9-NEXT: ; implicit-def: $sgpr47
; GFX9-NEXT: ; implicit-def: $sgpr46
-; GFX9-NEXT: s_branch .LBB17_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB17_2
+; GFX9-NEXT: s_branch .LBB17_3
;
; GFX11-LABEL: bitcast_v30i32_to_v60f16_scalar:
; GFX11: ; %bb.0:
@@ -9906,16 +9978,16 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX11-NEXT: v_readfirstlane_b32 s10, v6
; GFX11-NEXT: v_readfirstlane_b32 s11, v7
; GFX11-NEXT: v_readfirstlane_b32 s12, v8
-; GFX11-NEXT: v_readfirstlane_b32 s13, v9
+; GFX11-NEXT: v_readfirstlane_b32 s14, v9
; GFX11-NEXT: v_readfirstlane_b32 s15, v10
-; GFX11-NEXT: v_readfirstlane_b32 s14, v11
-; GFX11-NEXT: s_mov_b32 s94, 0
+; GFX11-NEXT: v_readfirstlane_b32 s13, v11
+; GFX11-NEXT: s_mov_b32 s94, -1
; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo
; GFX11-NEXT: s_cbranch_scc0 .LBB17_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s40, s14, 16
+; GFX11-NEXT: s_lshr_b32 s40, s13, 16
; GFX11-NEXT: s_lshr_b32 s41, s15, 16
-; GFX11-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-NEXT: s_lshr_b32 s42, s14, 16
; GFX11-NEXT: s_lshr_b32 s43, s12, 16
; GFX11-NEXT: s_lshr_b32 s44, s11, 16
; GFX11-NEXT: s_lshr_b32 s45, s10, 16
@@ -9943,12 +10015,11 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX11-NEXT: s_lshr_b32 s91, s2, 16
; GFX11-NEXT: s_lshr_b32 s92, s1, 16
; GFX11-NEXT: s_lshr_b32 s93, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
-; GFX11-NEXT: s_cbranch_vccnz .LBB17_3
+; GFX11-NEXT: s_cbranch_execnz .LBB17_3
; GFX11-NEXT: .LBB17_2: ; %cmp.true
-; GFX11-NEXT: s_add_i32 s14, s14, 3
-; GFX11-NEXT: s_add_i32 s15, s15, 3
; GFX11-NEXT: s_add_i32 s13, s13, 3
+; GFX11-NEXT: s_add_i32 s15, s15, 3
+; GFX11-NEXT: s_add_i32 s14, s14, 3
; GFX11-NEXT: s_add_i32 s12, s12, 3
; GFX11-NEXT: s_add_i32 s11, s11, 3
; GFX11-NEXT: s_add_i32 s10, s10, 3
@@ -9976,9 +10047,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX11-NEXT: s_add_i32 s2, s2, 3
; GFX11-NEXT: s_add_i32 s1, s1, 3
; GFX11-NEXT: s_add_i32 s0, s0, 3
-; GFX11-NEXT: s_lshr_b32 s40, s14, 16
+; GFX11-NEXT: s_lshr_b32 s40, s13, 16
; GFX11-NEXT: s_lshr_b32 s41, s15, 16
-; GFX11-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-NEXT: s_lshr_b32 s42, s14, 16
; GFX11-NEXT: s_lshr_b32 s43, s12, 16
; GFX11-NEXT: s_lshr_b32 s44, s11, 16
; GFX11-NEXT: s_lshr_b32 s45, s10, 16
@@ -10035,9 +10106,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45
; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44
; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43
-; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42
+; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s42
; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41
-; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40
+; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
@@ -10051,8 +10122,8 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7
; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9
; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11
-; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13
-; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
+; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s14
+; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s13
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT: .LBB17_4:
; GFX11-NEXT: ; implicit-def: $sgpr93
@@ -10085,7 +10156,9 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr41
; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: s_branch .LBB17_2
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
+; GFX11-NEXT: s_cbranch_vccz .LBB17_2
+; GFX11-NEXT: s_branch .LBB17_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -11696,11 +11769,11 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28
@@ -11714,83 +11787,92 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v39, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v2
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v8
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v62, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v9
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v62, v12
; SI-NEXT: v_cvt_f16_f32_e32 v41, v15
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v55, v14
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
; SI-NEXT: v_cvt_f16_f32_e32 v15, v17
; SI-NEXT: v_cvt_f16_f32_e32 v61, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v19
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v18
; SI-NEXT: v_cvt_f16_f32_e32 v17, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v20
; SI-NEXT: v_cvt_f16_f32_e32 v18, v23
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v19, v25
; SI-NEXT: v_cvt_f16_f32_e32 v21, v24
; SI-NEXT: v_cvt_f16_f32_e32 v20, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v26
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v29
-; SI-NEXT: v_cvt_f16_f32_e32 v51, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v12, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v12, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s18
; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v9, s20
; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
-; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
+; SI-NEXT: v_cvt_f16_f32_e32 v8, s24
; SI-NEXT: v_cvt_f16_f32_e32 v5, s27
-; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
+; SI-NEXT: v_cvt_f16_f32_e32 v7, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
-; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v54
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v50
+; SI-NEXT: v_cvt_f16_f32_e32 v50, s28
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v40
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v42
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v45
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -11799,260 +11881,240 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
-; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v59
; SI-NEXT: v_cvt_f16_f32_e32 v59, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB19_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v21, v19
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v22, v18
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
-; SI-NEXT: v_mov_b32_e32 v33, v32
-; SI-NEXT: v_or_b32_e32 v10, v32, v10
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_or_b32_e32 v13, v43, v13
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v8, v5
-; SI-NEXT: v_mov_b32_e32 v57, v39
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49
-; SI-NEXT: v_or_b32_e32 v7, v37, v7
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v6, v50, v6
+; SI-NEXT: v_mov_b32_e32 v30, v50
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
-; SI-NEXT: v_or_b32_e32 v1, v12, v1
-; SI-NEXT: v_or_b32_e32 v2, v11, v2
-; SI-NEXT: v_or_b32_e32 v4, v9, v4
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_mov_b32_e32 v52, v12
+; SI-NEXT: v_or_b32_e32 v0, v12, v0
+; SI-NEXT: v_or_b32_e32 v1, v11, v1
+; SI-NEXT: v_or_b32_e32 v2, v9, v2
+; SI-NEXT: v_or_b32_e32 v3, v10, v3
+; SI-NEXT: v_or_b32_e32 v4, v8, v4
+; SI-NEXT: v_or_b32_e32 v5, v7, v5
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_or_b32_e32 v18, v22, v18
-; SI-NEXT: v_or_b32_e32 v19, v21, v19
-; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48
-; SI-NEXT: v_or_b32_e32 v0, v58, v0
-; SI-NEXT: v_mov_b32_e32 v56, v34
-; SI-NEXT: v_mov_b32_e32 v47, v36
-; SI-NEXT: v_mov_b32_e32 v46, v35
+; SI-NEXT: v_mov_b32_e32 v58, v49
+; SI-NEXT: v_or_b32_e32 v8, v49, v8
+; SI-NEXT: v_mov_b32_e32 v57, v48
+; SI-NEXT: v_mov_b32_e32 v56, v36
+; SI-NEXT: v_or_b32_e32 v9, v36, v9
+; SI-NEXT: v_mov_b32_e32 v47, v38
+; SI-NEXT: v_mov_b32_e32 v46, v37
+; SI-NEXT: v_or_b32_e32 v10, v37, v10
+; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v45, v35
; SI-NEXT: v_or_b32_e32 v11, v35, v11
+; SI-NEXT: v_mov_b32_e32 v44, v34
; SI-NEXT: v_mov_b32_e32 v60, v63
-; SI-NEXT: v_mov_b32_e32 v45, v62
-; SI-NEXT: v_or_b32_e32 v12, v62, v12
+; SI-NEXT: v_or_b32_e32 v12, v63, v12
+; SI-NEXT: v_mov_b32_e32 v43, v62
+; SI-NEXT: v_or_b32_e32 v13, v62, v13
; SI-NEXT: v_mov_b32_e32 v42, v41
; SI-NEXT: v_mov_b32_e32 v40, v55
; SI-NEXT: v_or_b32_e32 v14, v55, v14
; SI-NEXT: v_or_b32_e32 v15, v61, v15
-; SI-NEXT: v_or_b32_e32 v20, v53, v20
-; SI-NEXT: v_or_b32_e32 v21, v51, v21
-; SI-NEXT: v_or_b32_e32 v22, v30, v22
-; SI-NEXT: v_or_b32_e32 v23, v31, v23
+; SI-NEXT: v_or_b32_e32 v16, v53, v16
+; SI-NEXT: v_or_b32_e32 v17, v51, v17
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_or_b32_e32 v20, v21, v20
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v22, v21
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v23, v22
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v24, v23
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_or_b32_e32 v24, v25, v24
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_or_b32_e32 v17, v32, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; SI-NEXT: v_or_b32_e32 v25, v26, v25
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v16, v43, v16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; SI-NEXT: v_or_b32_e32 v26, v27, v26
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v35, v39
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; SI-NEXT: v_or_b32_e32 v27, v28, v27
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37
-; SI-NEXT: v_or_b32_e32 v9, v39, v9
-; SI-NEXT: v_mov_b32_e32 v36, v37
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; SI-NEXT: v_or_b32_e32 v28, v29, v28
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v8, v38, v8
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50
+; SI-NEXT: v_or_b32_e32 v7, v31, v7
+; SI-NEXT: v_mov_b32_e32 v35, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; SI-NEXT: v_or_b32_e32 v29, v54, v29
-; SI-NEXT: v_mov_b32_e32 v54, v32
; SI-NEXT: s_branch .LBB19_3
; SI-NEXT: .LBB19_2:
-; SI-NEXT: v_mov_b32_e32 v54, v53
-; SI-NEXT: v_mov_b32_e32 v53, v52
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v30
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_mov_b32_e32 v48, v31
+; SI-NEXT: v_mov_b32_e32 v52, v12
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT: v_mov_b32_e32 v47, v36
-; SI-NEXT: v_mov_b32_e32 v46, v35
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_mov_b32_e32 v30, v50
-; SI-NEXT: v_mov_b32_e32 v50, v51
-; SI-NEXT: v_mov_b32_e32 v51, v52
-; SI-NEXT: v_mov_b32_e32 v52, v53
-; SI-NEXT: v_mov_b32_e32 v53, v54
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v57, v39
-; SI-NEXT: v_mov_b32_e32 v56, v34
+; SI-NEXT: v_mov_b32_e32 v45, v35
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v58, v49
+; SI-NEXT: v_mov_b32_e32 v57, v48
+; SI-NEXT: v_mov_b32_e32 v56, v36
+; SI-NEXT: v_mov_b32_e32 v47, v38
+; SI-NEXT: v_mov_b32_e32 v46, v37
; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v44, v34
; SI-NEXT: v_mov_b32_e32 v60, v63
-; SI-NEXT: v_mov_b32_e32 v45, v62
+; SI-NEXT: v_mov_b32_e32 v43, v62
; SI-NEXT: v_mov_b32_e32 v42, v41
; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v31, v48
-; SI-NEXT: v_mov_b32_e32 v48, v49
; SI-NEXT: .LBB19_3: ; %Flow
; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: v_mov_b32_e32 v61, v40
-; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB19_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v50
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v31
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v58
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v45
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v46
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v45
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v43
; SI-NEXT: v_mov_b32_e32 v55, v42
; SI-NEXT: v_cvt_f32_f16_e32 v15, v61
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
+; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v51
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v48
-; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v25, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v51
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -12060,42 +12122,48 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
+; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -12108,65 +12176,65 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v35
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v57
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: v_or_b32_e32 v9, v10, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v47
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v47
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v32
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v44
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v13, v12
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
@@ -12178,14 +12246,14 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v14, v15, v14
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_or_b32_e32 v15, v16, v15
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
@@ -12193,9 +12261,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: v_or_b32_e32 v16, v17, v16
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v17, v19, v17
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
@@ -12205,7 +12273,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_or_b32_e32 v18, v20, v18
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
@@ -12217,32 +12285,39 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_or_b32_e32 v19, v20, v19
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v52
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_or_b32_e32 v20, v22, v20
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v50
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_or_b32_e32 v21, v23, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v30
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_or_b32_e32 v22, v23, v22
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_or_b32_e32 v23, v25, v23
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
@@ -12327,6 +12402,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; VI-NEXT: s_lshr_b32 s42, s17, 16
; VI-NEXT: s_lshr_b32 s43, s16, 16
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v32, v15
; VI-NEXT: v_mov_b32_e32 v33, v14
; VI-NEXT: v_mov_b32_e32 v34, v13
@@ -12343,7 +12419,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v53, v2
; VI-NEXT: v_mov_b32_e32 v54, v1
; VI-NEXT: v_mov_b32_e32 v55, v0
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: s_cbranch_scc0 .LBB19_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_mov_b32_e32 v0, 16
@@ -12547,11 +12623,28 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB19_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; VI-NEXT: s_branch .LBB19_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB19_2
+; VI-NEXT: s_branch .LBB19_3
;
; GFX9-LABEL: bitcast_v60f16_to_v30i32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_lshr_b32 s40, s29, 16
+; GFX9-NEXT: s_lshr_b32 s41, s28, 16
+; GFX9-NEXT: s_lshr_b32 s42, s27, 16
+; GFX9-NEXT: s_lshr_b32 s43, s26, 16
+; GFX9-NEXT: s_lshr_b32 s15, s25, 16
+; GFX9-NEXT: s_lshr_b32 s14, s24, 16
+; GFX9-NEXT: s_lshr_b32 s13, s23, 16
+; GFX9-NEXT: s_lshr_b32 s12, s22, 16
+; GFX9-NEXT: s_lshr_b32 s11, s21, 16
+; GFX9-NEXT: s_lshr_b32 s10, s20, 16
+; GFX9-NEXT: s_lshr_b32 s9, s19, 16
+; GFX9-NEXT: s_lshr_b32 s8, s18, 16
+; GFX9-NEXT: s_lshr_b32 s7, s17, 16
+; GFX9-NEXT: s_lshr_b32 s6, s16, 16
; GFX9-NEXT: v_mov_b32_e32 v32, v15
; GFX9-NEXT: v_mov_b32_e32 v33, v14
; GFX9-NEXT: v_mov_b32_e32 v34, v13
@@ -12568,21 +12661,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v53, v2
; GFX9-NEXT: v_mov_b32_e32 v54, v1
; GFX9-NEXT: v_mov_b32_e32 v55, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_lshr_b32 s40, s29, 16
-; GFX9-NEXT: s_lshr_b32 s41, s28, 16
-; GFX9-NEXT: s_lshr_b32 s42, s27, 16
-; GFX9-NEXT: s_lshr_b32 s43, s26, 16
-; GFX9-NEXT: s_lshr_b32 s15, s25, 16
-; GFX9-NEXT: s_lshr_b32 s14, s24, 16
-; GFX9-NEXT: s_lshr_b32 s13, s23, 16
-; GFX9-NEXT: s_lshr_b32 s12, s22, 16
-; GFX9-NEXT: s_lshr_b32 s11, s21, 16
-; GFX9-NEXT: s_lshr_b32 s10, s20, 16
-; GFX9-NEXT: s_lshr_b32 s9, s19, 16
-; GFX9-NEXT: s_lshr_b32 s8, s18, 16
-; GFX9-NEXT: s_lshr_b32 s7, s17, 16
-; GFX9-NEXT: s_lshr_b32 s6, s16, 16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -12603,7 +12682,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33
; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7
; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8
@@ -12618,6 +12696,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42
; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37
; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38
@@ -12765,7 +12844,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: .LBB19_4:
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX9-NEXT: s_branch .LBB19_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB19_2
+; GFX9-NEXT: s_branch .LBB19_3
;
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -12810,41 +12891,41 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1
+; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -12859,17 +12940,16 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_3
; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -12883,24 +12963,24 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -12917,7 +12997,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB19_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB19_2
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB19_2
+; GFX11-TRUE16-NEXT: s_branch .LBB19_3
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32_scalar:
; GFX11-FAKE16: ; %bb.0:
@@ -12950,41 +13032,41 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -12999,17 +13081,16 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_3
; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -13023,24 +13104,24 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -13057,7 +13138,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB19_4:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB19_2
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB19_2
+; GFX11-FAKE16-NEXT: s_branch .LBB19_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13255,6 +13338,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -13272,7 +13356,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; SI-NEXT: v_mov_b32_e32 v15, v1
; SI-NEXT: v_mov_b32_e32 v14, v0
; SI-NEXT: v_mov_b32_e32 v0, s16
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
@@ -13286,10 +13370,13 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; SI-NEXT: v_mov_b32_e32 v11, s27
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB21_4
+; SI-NEXT: s_cbranch_scc0 .LBB21_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB21_3
-; SI-NEXT: .LBB21_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB21_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB21_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_f32_e32 v29, 1.0, v29
; SI-NEXT: v_add_f32_e32 v28, 1.0, v28
; SI-NEXT: v_add_f32_e32 v27, 1.0, v27
@@ -13320,16 +13407,15 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
-; SI-NEXT: .LBB21_3: ; %end
+; SI-NEXT: .LBB21_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB21_4:
-; SI-NEXT: s_branch .LBB21_2
;
; VI-LABEL: bitcast_v30f32_to_v15i64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -13347,7 +13433,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; VI-NEXT: v_mov_b32_e32 v15, v1
; VI-NEXT: v_mov_b32_e32 v14, v0
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
@@ -13361,10 +13447,13 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; VI-NEXT: v_mov_b32_e32 v11, s27
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB21_4
+; VI-NEXT: s_cbranch_scc0 .LBB21_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB21_3
-; VI-NEXT: .LBB21_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB21_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB21_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_f32_e32 v29, 1.0, v29
; VI-NEXT: v_add_f32_e32 v28, 1.0, v28
; VI-NEXT: v_add_f32_e32 v27, 1.0, v27
@@ -13395,16 +13484,15 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; VI-NEXT: v_add_f32_e32 v2, 1.0, v2
; VI-NEXT: v_add_f32_e32 v1, 1.0, v1
; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
-; VI-NEXT: .LBB21_3: ; %end
+; VI-NEXT: .LBB21_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB21_4:
-; VI-NEXT: s_branch .LBB21_2
;
; GFX9-LABEL: bitcast_v30f32_to_v15i64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -13422,7 +13510,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v15, v1
; GFX9-NEXT: v_mov_b32_e32 v14, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
@@ -13436,10 +13524,13 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v11, s27
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB21_4
+; GFX9-NEXT: s_cbranch_scc0 .LBB21_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB21_3
-; GFX9-NEXT: .LBB21_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB21_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB21_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29
; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28
; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27
@@ -13470,43 +13561,41 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT: .LBB21_3: ; %end
+; GFX9-NEXT: .LBB21_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB21_4:
-; GFX9-NEXT: s_branch .LBB21_2
;
; GFX11-LABEL: bitcast_v30f32_to_v15i64_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB21_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB21_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB21_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB21_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB21_3:
-; GFX11-NEXT: .LBB21_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB21_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
@@ -13522,6 +13611,7 @@ define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT: .LBB21_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13743,6 +13833,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -13760,7 +13851,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a,
; SI-NEXT: v_mov_b32_e32 v15, v1
; SI-NEXT: v_mov_b32_e32 v14, v0
; SI-NEXT: v_mov_b32_e32 v0, s16
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
@@ -13774,10 +13865,13 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a,
; SI-NEXT: v_mov_b32_e32 v11, s27
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB23_4
+; SI-NEXT: s_cbranch_scc0 .LBB23_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB23_3
-; SI-NEXT: .LBB23_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB23_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB23_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26
@@ -13808,16 +13902,15 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a,
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; SI-NEXT: .LBB23_3: ; %end
+; SI-NEXT: .LBB23_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB23_4:
-; SI-NEXT: s_branch .LBB23_2
;
; VI-LABEL: bitcast_v15i64_to_v30f32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -13835,7 +13928,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a,
; VI-NEXT: v_mov_b32_e32 v15, v1
; VI-NEXT: v_mov_b32_e32 v14, v0
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
@@ -13849,10 +13942,13 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a,
; VI-NEXT: v_mov_b32_e32 v11, s27
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB23_4
+; VI-NEXT: s_cbranch_scc0 .LBB23_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB23_3
-; VI-NEXT: .LBB23_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB23_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB23_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28
; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26
@@ -13883,16 +13979,15 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a,
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: .LBB23_3: ; %end
+; VI-NEXT: .LBB23_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB23_4:
-; VI-NEXT: s_branch .LBB23_2
;
; GFX9-LABEL: bitcast_v15i64_to_v30f32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -13910,7 +14005,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v15, v1
; GFX9-NEXT: v_mov_b32_e32 v14, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
@@ -13924,10 +14019,13 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v11, s27
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB23_4
+; GFX9-NEXT: s_cbranch_scc0 .LBB23_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB23_3
-; GFX9-NEXT: .LBB23_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB23_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28
; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc
; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26
@@ -13958,43 +14056,41 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a,
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: .LBB23_3: ; %end
+; GFX9-NEXT: .LBB23_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB23_4:
-; GFX9-NEXT: s_branch .LBB23_2
;
; GFX11-LABEL: bitcast_v15i64_to_v30f32_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB23_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB23_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB23_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB23_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB23_3:
-; GFX11-NEXT: .LBB23_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB23_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo
@@ -14033,6 +14129,7 @@ define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a,
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT: .LBB23_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -14231,6 +14328,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -14248,7 +14346,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; SI-NEXT: v_mov_b32_e32 v15, v1
; SI-NEXT: v_mov_b32_e32 v14, v0
; SI-NEXT: v_mov_b32_e32 v0, s16
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
@@ -14262,10 +14360,13 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; SI-NEXT: v_mov_b32_e32 v11, s27
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB25_4
+; SI-NEXT: s_cbranch_scc0 .LBB25_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB25_3
-; SI-NEXT: .LBB25_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB25_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB25_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_f32_e32 v29, 1.0, v29
; SI-NEXT: v_add_f32_e32 v28, 1.0, v28
; SI-NEXT: v_add_f32_e32 v27, 1.0, v27
@@ -14296,16 +14397,15 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
-; SI-NEXT: .LBB25_3: ; %end
+; SI-NEXT: .LBB25_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB25_4:
-; SI-NEXT: s_branch .LBB25_2
;
; VI-LABEL: bitcast_v30f32_to_v15f64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -14323,7 +14423,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; VI-NEXT: v_mov_b32_e32 v15, v1
; VI-NEXT: v_mov_b32_e32 v14, v0
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
@@ -14337,10 +14437,13 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; VI-NEXT: v_mov_b32_e32 v11, s27
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB25_4
+; VI-NEXT: s_cbranch_scc0 .LBB25_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB25_3
-; VI-NEXT: .LBB25_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB25_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB25_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_f32_e32 v29, 1.0, v29
; VI-NEXT: v_add_f32_e32 v28, 1.0, v28
; VI-NEXT: v_add_f32_e32 v27, 1.0, v27
@@ -14371,16 +14474,15 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; VI-NEXT: v_add_f32_e32 v2, 1.0, v2
; VI-NEXT: v_add_f32_e32 v1, 1.0, v1
; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
-; VI-NEXT: .LBB25_3: ; %end
+; VI-NEXT: .LBB25_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB25_4:
-; VI-NEXT: s_branch .LBB25_2
;
; GFX9-LABEL: bitcast_v30f32_to_v15f64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -14398,7 +14500,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; GFX9-NEXT: v_mov_b32_e32 v15, v1
; GFX9-NEXT: v_mov_b32_e32 v14, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
@@ -14412,10 +14514,13 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; GFX9-NEXT: v_mov_b32_e32 v11, s27
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB25_4
+; GFX9-NEXT: s_cbranch_scc0 .LBB25_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB25_3
-; GFX9-NEXT: .LBB25_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB25_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB25_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29
; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28
; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27
@@ -14446,43 +14551,41 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT: .LBB25_3: ; %end
+; GFX9-NEXT: .LBB25_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB25_4:
-; GFX9-NEXT: s_branch .LBB25_2
;
; GFX11-LABEL: bitcast_v30f32_to_v15f64_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB25_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB25_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB25_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB25_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB25_3:
-; GFX11-NEXT: .LBB25_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB25_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
@@ -14498,6 +14601,7 @@ define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %
; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT: .LBB25_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -14651,6 +14755,7 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -14679,13 +14784,16 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; SI-NEXT: v_mov_b32_e32 v9, s25
; SI-NEXT: v_mov_b32_e32 v10, s26
; SI-NEXT: v_mov_b32_e32 v11, s27
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB27_4
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: s_cbranch_scc0 .LBB27_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB27_3
-; SI-NEXT: .LBB27_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB27_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB27_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
@@ -14701,17 +14809,16 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; SI-NEXT: .LBB27_3: ; %end
+; SI-NEXT: .LBB27_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: v_mov_b32_e32 v17, v31
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB27_4:
-; SI-NEXT: s_branch .LBB27_2
;
; VI-LABEL: bitcast_v15f64_to_v30f32_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -14740,13 +14847,16 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; VI-NEXT: v_mov_b32_e32 v9, s25
; VI-NEXT: v_mov_b32_e32 v10, s26
; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB27_4
+; VI-NEXT: s_mov_b64 s[4:5], -1
+; VI-NEXT: s_cbranch_scc0 .LBB27_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB27_3
-; VI-NEXT: .LBB27_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB27_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB27_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
@@ -14762,17 +14872,16 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; VI-NEXT: .LBB27_3: ; %end
+; VI-NEXT: .LBB27_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: v_mov_b32_e32 v17, v31
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB27_4:
-; VI-NEXT: s_branch .LBB27_2
;
; GFX9-LABEL: bitcast_v15f64_to_v30f32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -14801,13 +14910,16 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; GFX9-NEXT: v_mov_b32_e32 v9, s25
; GFX9-NEXT: v_mov_b32_e32 v10, s26
; GFX9-NEXT: v_mov_b32_e32 v11, s27
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB27_4
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
+; GFX9-NEXT: s_cbranch_scc0 .LBB27_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB27_3
-; GFX9-NEXT: .LBB27_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB27_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB27_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
@@ -14823,44 +14935,42 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT: .LBB27_3: ; %end
+; GFX9-NEXT: .LBB27_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: v_mov_b32_e32 v17, v31
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB27_4:
-; GFX9-NEXT: s_branch .LBB27_2
;
; GFX11-LABEL: bitcast_v15f64_to_v30f32_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB27_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB27_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB27_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB27_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB27_3:
-; GFX11-NEXT: .LBB27_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB27_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
@@ -14876,6 +14986,7 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %
; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB27_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -15867,12 +15978,13 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
-; SI-NEXT: v_mov_b32_e32 v30, s16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: v_mov_b32_e32 v29, s16
; SI-NEXT: v_mov_b32_e32 v28, s17
; SI-NEXT: v_mov_b32_e32 v33, s18
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_mov_b32_e32 v32, s19
-; SI-NEXT: v_mov_b32_e32 v29, s20
+; SI-NEXT: v_mov_b32_e32 v30, s20
; SI-NEXT: v_mov_b32_e32 v27, s21
; SI-NEXT: v_mov_b32_e32 v25, s22
; SI-NEXT: v_mov_b32_e32 v24, s23
@@ -15909,11 +16021,11 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16
; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16
; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16
-; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16
+; SI-NEXT: v_alignbit_b32 v44, v27, v30, 16
; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: v_alignbit_b32 v46, v32, v33, 16
; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16
+; SI-NEXT: v_alignbit_b32 v56, v28, v29, 16
; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12
@@ -15936,11 +16048,11 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; SI-NEXT: s_cbranch_execnz .LBB29_3
; SI-NEXT: .LBB29_2: ; %cmp.true
; SI-NEXT: v_add_f32_e32 v28, 1.0, v28
-; SI-NEXT: v_add_f32_e32 v30, 1.0, v30
+; SI-NEXT: v_add_f32_e32 v29, 1.0, v29
; SI-NEXT: v_add_f32_e32 v32, 1.0, v32
; SI-NEXT: v_add_f32_e32 v33, 1.0, v33
; SI-NEXT: v_add_f32_e32 v27, 1.0, v27
-; SI-NEXT: v_add_f32_e32 v29, 1.0, v29
+; SI-NEXT: v_add_f32_e32 v30, 1.0, v30
; SI-NEXT: v_add_f32_e32 v24, 1.0, v24
; SI-NEXT: v_add_f32_e32 v25, 1.0, v25
; SI-NEXT: v_add_f32_e32 v21, 1.0, v21
@@ -15977,11 +16089,11 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16
; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16
; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16
-; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16
+; SI-NEXT: v_alignbit_b32 v44, v27, v30, 16
; SI-NEXT: s_waitcnt expcnt(6)
; SI-NEXT: v_alignbit_b32 v46, v32, v33, 16
; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16
+; SI-NEXT: v_alignbit_b32 v56, v28, v29, 16
; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12
@@ -16002,31 +16114,31 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28
; SI-NEXT: .LBB29_3: ; %end
-; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29
; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56
-; SI-NEXT: v_or_b32_e32 v30, v30, v56
-; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen
+; SI-NEXT: v_or_b32_e32 v29, v29, v56
+; SI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen
; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v60
-; SI-NEXT: v_or_b32_e32 v28, v28, v30
-; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0
-; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v60
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_add_i32_e32 v29, vcc, 4, v0
+; SI-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v28, 0xffff, v33
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v46
-; SI-NEXT: v_or_b32_e32 v28, v28, v30
-; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0
-; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v46
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_add_i32_e32 v29, vcc, 8, v0
+; SI-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v28, 0xffff, v32
-; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59
-; SI-NEXT: v_or_b32_e32 v28, v28, v30
-; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0
-; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_add_i32_e32 v29, vcc, 12, v0
+; SI-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; SI-NEXT: v_and_b32_e32 v28, 0xffff, v30
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44
; SI-NEXT: v_or_b32_e32 v28, v28, v29
; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0
@@ -16227,19 +16339,22 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; SI-NEXT: ; implicit-def: $vgpr39
; SI-NEXT: ; implicit-def: $vgpr22
; SI-NEXT: ; implicit-def: $vgpr37
-; SI-NEXT: s_branch .LBB29_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz .LBB29_2
+; SI-NEXT: s_branch .LBB29_3
;
; VI-LABEL: bitcast_v30f32_to_v60i16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v19, s16
; VI-NEXT: v_mov_b32_e32 v18, s17
; VI-NEXT: v_mov_b32_e32 v17, s18
-; VI-NEXT: v_mov_b32_e32 v28, s19
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: v_mov_b32_e32 v27, s19
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_mov_b32_e32 v29, s20
-; VI-NEXT: v_mov_b32_e32 v27, s21
+; VI-NEXT: v_mov_b32_e32 v28, s21
; VI-NEXT: v_mov_b32_e32 v26, s22
; VI-NEXT: v_mov_b32_e32 v25, s23
; VI-NEXT: v_mov_b32_e32 v24, s24
@@ -16286,9 +16401,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24
; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26
-; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28
; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27
; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19
@@ -16318,9 +16433,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; VI-NEXT: v_add_f32_e32 v24, 1.0, v24
; VI-NEXT: v_add_f32_e32 v25, 1.0, v25
; VI-NEXT: v_add_f32_e32 v26, 1.0, v26
-; VI-NEXT: v_add_f32_e32 v27, 1.0, v27
-; VI-NEXT: v_add_f32_e32 v29, 1.0, v29
; VI-NEXT: v_add_f32_e32 v28, 1.0, v28
+; VI-NEXT: v_add_f32_e32 v29, 1.0, v29
+; VI-NEXT: v_add_f32_e32 v27, 1.0, v27
; VI-NEXT: v_add_f32_e32 v17, 1.0, v17
; VI-NEXT: v_add_f32_e32 v18, 1.0, v18
; VI-NEXT: v_add_f32_e32 v19, 1.0, v19
@@ -16348,9 +16463,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24
; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26
-; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28
; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27
; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19
@@ -16362,11 +16477,11 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34
; VI-NEXT: v_or_b32_sdwa v34, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35
-; VI-NEXT: v_or_b32_sdwa v35, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v35, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37
-; VI-NEXT: v_or_b32_sdwa v37, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38
; VI-NEXT: v_or_b32_sdwa v38, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39
@@ -16476,19 +16591,22 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; VI-NEXT: ; implicit-def: $vgpr40
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: s_branch .LBB29_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB29_2
+; VI-NEXT: s_branch .LBB29_3
;
; GFX9-LABEL: bitcast_v30f32_to_v60i16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v19, s16
; GFX9-NEXT: v_mov_b32_e32 v18, s17
; GFX9-NEXT: v_mov_b32_e32 v17, s18
-; GFX9-NEXT: v_mov_b32_e32 v28, s19
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: v_mov_b32_e32 v27, s19
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v29, s20
-; GFX9-NEXT: v_mov_b32_e32 v27, s21
+; GFX9-NEXT: v_mov_b32_e32 v28, s21
; GFX9-NEXT: v_mov_b32_e32 v26, s22
; GFX9-NEXT: v_mov_b32_e32 v25, s23
; GFX9-NEXT: v_mov_b32_e32 v24, s24
@@ -16535,9 +16653,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24
; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26
-; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19
@@ -16567,9 +16685,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24
; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25
; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26
-; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27
-; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29
; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28
+; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29
+; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27
; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17
; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18
; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19
@@ -16597,20 +16715,20 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24
; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26
-; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19
; GFX9-NEXT: .LBB29_3: ; %end
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17
; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27
; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29
; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28
; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26
; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17
@@ -16725,7 +16843,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr54
-; GFX9-NEXT: s_branch .LBB29_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB29_2
+; GFX9-NEXT: s_branch .LBB29_3
;
; GFX11-LABEL: bitcast_v30f32_to_v60i16_scalar:
; GFX11: ; %bb.0:
@@ -16738,10 +16858,10 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21
; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23
; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26
-; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v17, s27
+; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v15, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
; GFX11-NEXT: s_cbranch_scc0 .LBB29_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
@@ -16756,9 +16876,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v17
; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
@@ -16774,8 +16894,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29
; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB29_3
+; GFX11-NEXT: s_cbranch_execnz .LBB29_3
; GFX11-NEXT: .LBB29_2: ; %cmp.true
; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
@@ -16783,8 +16902,8 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v14, 1.0, v14
; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18
; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
@@ -16804,9 +16923,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v17
; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
@@ -16837,7 +16956,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v19
; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18
; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v15
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
@@ -16853,7 +16972,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
; GFX11-NEXT: v_lshl_or_b32 v13, v82, 16, v14
; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v17
+; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v17
; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v18
; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
@@ -16865,7 +16984,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28
; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22
; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0
; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2
; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3
@@ -16881,7 +17000,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24
; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15
+; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v16
; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0
; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2
; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3
@@ -16924,7 +17043,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: ; implicit-def: $vgpr52
; GFX11-NEXT: ; implicit-def: $vgpr51
; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB29_2
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_vccz .LBB29_2
+; GFX11-NEXT: s_branch .LBB29_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -18342,6 +18463,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; SI-NEXT: v_mov_b32_e32 v35, v22
; SI-NEXT: v_mov_b32_e32 v36, v20
; SI-NEXT: v_mov_b32_e32 v37, v18
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5
@@ -18373,7 +18495,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_and_b64 s[6:7], vcc, exec
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10
@@ -18676,7 +18798,9 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v30, v32
-; SI-NEXT: s_branch .LBB31_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz .LBB31_2
+; SI-NEXT: s_branch .LBB31_3
;
; VI-LABEL: bitcast_v60i16_to_v30f32_scalar:
; VI: ; %bb.0:
@@ -18696,6 +18820,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; VI-NEXT: s_lshr_b32 s42, s17, 16
; VI-NEXT: s_lshr_b32 s43, s16, 16
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v32, v15
; VI-NEXT: v_mov_b32_e32 v33, v14
; VI-NEXT: v_mov_b32_e32 v34, v13
@@ -18712,7 +18837,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; VI-NEXT: v_mov_b32_e32 v53, v2
; VI-NEXT: v_mov_b32_e32 v54, v1
; VI-NEXT: v_mov_b32_e32 v55, v0
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: s_cbranch_scc0 .LBB31_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_mov_b32_e32 v0, 16
@@ -18959,11 +19084,28 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB31_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; VI-NEXT: s_branch .LBB31_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB31_2
+; VI-NEXT: s_branch .LBB31_3
;
; GFX9-LABEL: bitcast_v60i16_to_v30f32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_lshr_b32 s40, s29, 16
+; GFX9-NEXT: s_lshr_b32 s41, s28, 16
+; GFX9-NEXT: s_lshr_b32 s42, s27, 16
+; GFX9-NEXT: s_lshr_b32 s43, s26, 16
+; GFX9-NEXT: s_lshr_b32 s15, s25, 16
+; GFX9-NEXT: s_lshr_b32 s14, s24, 16
+; GFX9-NEXT: s_lshr_b32 s13, s23, 16
+; GFX9-NEXT: s_lshr_b32 s12, s22, 16
+; GFX9-NEXT: s_lshr_b32 s11, s21, 16
+; GFX9-NEXT: s_lshr_b32 s10, s20, 16
+; GFX9-NEXT: s_lshr_b32 s9, s19, 16
+; GFX9-NEXT: s_lshr_b32 s8, s18, 16
+; GFX9-NEXT: s_lshr_b32 s7, s17, 16
+; GFX9-NEXT: s_lshr_b32 s6, s16, 16
; GFX9-NEXT: v_mov_b32_e32 v32, v15
; GFX9-NEXT: v_mov_b32_e32 v33, v14
; GFX9-NEXT: v_mov_b32_e32 v34, v13
@@ -18980,21 +19122,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v53, v2
; GFX9-NEXT: v_mov_b32_e32 v54, v1
; GFX9-NEXT: v_mov_b32_e32 v55, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_lshr_b32 s40, s29, 16
-; GFX9-NEXT: s_lshr_b32 s41, s28, 16
-; GFX9-NEXT: s_lshr_b32 s42, s27, 16
-; GFX9-NEXT: s_lshr_b32 s43, s26, 16
-; GFX9-NEXT: s_lshr_b32 s15, s25, 16
-; GFX9-NEXT: s_lshr_b32 s14, s24, 16
-; GFX9-NEXT: s_lshr_b32 s13, s23, 16
-; GFX9-NEXT: s_lshr_b32 s12, s22, 16
-; GFX9-NEXT: s_lshr_b32 s11, s21, 16
-; GFX9-NEXT: s_lshr_b32 s10, s20, 16
-; GFX9-NEXT: s_lshr_b32 s9, s19, 16
-; GFX9-NEXT: s_lshr_b32 s8, s18, 16
-; GFX9-NEXT: s_lshr_b32 s7, s17, 16
-; GFX9-NEXT: s_lshr_b32 s6, s16, 16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -19015,7 +19143,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33
; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7
; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8
@@ -19030,6 +19157,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42
; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37
; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38
@@ -19175,7 +19303,9 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: .LBB31_4:
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX9-NEXT: s_branch .LBB31_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB31_2
+; GFX9-NEXT: s_branch .LBB31_3
;
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -19220,41 +19350,41 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1
+; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -19269,17 +19399,16 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_3
; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -19293,24 +19422,24 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -19327,7 +19456,9 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB31_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB31_2
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB31_2
+; GFX11-TRUE16-NEXT: s_branch .LBB31_3
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32_scalar:
; GFX11-FAKE16: ; %bb.0:
@@ -19360,41 +19491,41 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -19409,17 +19540,16 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_3
; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -19433,24 +19563,24 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -19467,7 +19597,9 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB31_4:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB31_2
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB31_2
+; GFX11-FAKE16-NEXT: s_branch .LBB31_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -20815,6 +20947,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s45, v1
; SI-NEXT: v_readfirstlane_b32 s44, v2
; SI-NEXT: v_readfirstlane_b32 s43, v3
@@ -20827,11 +20960,11 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: v_readfirstlane_b32 s12, v10
; SI-NEXT: v_readfirstlane_b32 s11, v11
; SI-NEXT: v_readfirstlane_b32 s10, v12
-; SI-NEXT: v_readfirstlane_b32 s8, v13
-; SI-NEXT: v_readfirstlane_b32 s7, v14
-; SI-NEXT: v_readfirstlane_b32 s6, v15
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s9, v16
+; SI-NEXT: v_readfirstlane_b32 s9, v13
+; SI-NEXT: v_readfirstlane_b32 s8, v14
+; SI-NEXT: v_readfirstlane_b32 s7, v15
+; SI-NEXT: v_readfirstlane_b32 s6, v16
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -20850,14 +20983,14 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB33_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_lshr_b32 s4, s9, 16
+; SI-NEXT: s_lshr_b32 s4, s6, 16
; SI-NEXT: s_waitcnt expcnt(4)
; SI-NEXT: v_cvt_f32_f16_e32 v59, s4
-; SI-NEXT: s_lshr_b32 s4, s6, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v42, s4
; SI-NEXT: s_lshr_b32 s4, s7, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v54, s4
+; SI-NEXT: v_cvt_f32_f16_e32 v42, s4
; SI-NEXT: s_lshr_b32 s4, s8, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v54, s4
+; SI-NEXT: s_lshr_b32 s4, s9, 16
; SI-NEXT: v_cvt_f32_f16_e32 v50, s4
; SI-NEXT: s_lshr_b32 s4, s10, 16
; SI-NEXT: v_cvt_f32_f16_e32 v57, s4
@@ -20912,10 +21045,10 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: s_lshr_b32 s4, s16, 16
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: v_cvt_f32_f16_e32 v60, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v14, s9
-; SI-NEXT: v_cvt_f32_f16_e32 v38, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v45, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v47, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v14, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v38, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v45, s8
+; SI-NEXT: v_cvt_f32_f16_e32 v47, s9
; SI-NEXT: v_cvt_f32_f16_e32 v1, s10
; SI-NEXT: s_waitcnt expcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v61, s11
@@ -20969,7 +21102,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; SI-NEXT: v_add_f32_e64 v11, s22, 1.0
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16
-; SI-NEXT: v_add_f32_e64 v40, s6, 1.0
+; SI-NEXT: v_add_f32_e64 v40, s7, 1.0
; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11
; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v40
; SI-NEXT: v_cvt_f32_f16_e32 v38, v40
@@ -20983,7 +21116,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: v_add_f32_e64 v19, s26, 1.0
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v28
; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23
-; SI-NEXT: v_add_f32_e64 v48, s8, 1.0
+; SI-NEXT: v_add_f32_e64 v48, s9, 1.0
; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19
; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48
; SI-NEXT: v_cvt_f32_f16_e32 v47, v48
@@ -21032,8 +21165,8 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: v_add_f32_e64 v21, s15, 1.0
; SI-NEXT: v_add_f32_e64 v20, s14, 1.0
; SI-NEXT: v_add_f32_e64 v33, s11, 1.0
-; SI-NEXT: v_add_f32_e64 v52, s7, 1.0
-; SI-NEXT: v_add_f32_e64 v44, s9, 1.0
+; SI-NEXT: v_add_f32_e64 v52, s8, 1.0
+; SI-NEXT: v_add_f32_e64 v44, s6, 1.0
; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21
@@ -21375,19 +21508,22 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; SI-NEXT: ; implicit-def: $vgpr42
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: s_branch .LBB33_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz .LBB33_2
+; SI-NEXT: s_branch .LBB33_3
;
; VI-LABEL: bitcast_v30f32_to_v60f16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v19, s16
; VI-NEXT: v_mov_b32_e32 v18, s17
; VI-NEXT: v_mov_b32_e32 v17, s18
-; VI-NEXT: v_mov_b32_e32 v28, s19
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: v_mov_b32_e32 v27, s19
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_mov_b32_e32 v29, s20
-; VI-NEXT: v_mov_b32_e32 v27, s21
+; VI-NEXT: v_mov_b32_e32 v28, s21
; VI-NEXT: v_mov_b32_e32 v26, s22
; VI-NEXT: v_mov_b32_e32 v25, s23
; VI-NEXT: v_mov_b32_e32 v24, s24
@@ -21434,9 +21570,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24
; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26
-; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28
; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27
; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19
@@ -21466,9 +21602,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; VI-NEXT: v_add_f32_e32 v24, 1.0, v24
; VI-NEXT: v_add_f32_e32 v25, 1.0, v25
; VI-NEXT: v_add_f32_e32 v26, 1.0, v26
-; VI-NEXT: v_add_f32_e32 v27, 1.0, v27
-; VI-NEXT: v_add_f32_e32 v29, 1.0, v29
; VI-NEXT: v_add_f32_e32 v28, 1.0, v28
+; VI-NEXT: v_add_f32_e32 v29, 1.0, v29
+; VI-NEXT: v_add_f32_e32 v27, 1.0, v27
; VI-NEXT: v_add_f32_e32 v17, 1.0, v17
; VI-NEXT: v_add_f32_e32 v18, 1.0, v18
; VI-NEXT: v_add_f32_e32 v19, 1.0, v19
@@ -21496,9 +21632,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24
; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26
-; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28
; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27
; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19
@@ -21510,11 +21646,11 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34
; VI-NEXT: v_or_b32_sdwa v34, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35
-; VI-NEXT: v_or_b32_sdwa v35, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v35, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37
-; VI-NEXT: v_or_b32_sdwa v37, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38
; VI-NEXT: v_or_b32_sdwa v38, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39
@@ -21624,19 +21760,22 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; VI-NEXT: ; implicit-def: $vgpr40
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: s_branch .LBB33_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB33_2
+; VI-NEXT: s_branch .LBB33_3
;
; GFX9-LABEL: bitcast_v30f32_to_v60f16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v19, s16
; GFX9-NEXT: v_mov_b32_e32 v18, s17
; GFX9-NEXT: v_mov_b32_e32 v17, s18
-; GFX9-NEXT: v_mov_b32_e32 v28, s19
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: v_mov_b32_e32 v27, s19
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v29, s20
-; GFX9-NEXT: v_mov_b32_e32 v27, s21
+; GFX9-NEXT: v_mov_b32_e32 v28, s21
; GFX9-NEXT: v_mov_b32_e32 v26, s22
; GFX9-NEXT: v_mov_b32_e32 v25, s23
; GFX9-NEXT: v_mov_b32_e32 v24, s24
@@ -21683,9 +21822,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24
; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26
-; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19
@@ -21715,9 +21854,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24
; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25
; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26
-; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27
-; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29
; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28
+; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29
+; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27
; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17
; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18
; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19
@@ -21745,20 +21884,20 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24
; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25
; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26
-; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v27
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19
; GFX9-NEXT: .LBB33_3: ; %end
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17
; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27
; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29
; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28
; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26
; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17
@@ -21873,7 +22012,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr54
-; GFX9-NEXT: s_branch .LBB33_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB33_2
+; GFX9-NEXT: s_branch .LBB33_3
;
; GFX11-LABEL: bitcast_v30f32_to_v60f16_scalar:
; GFX11: ; %bb.0:
@@ -21886,10 +22027,10 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21
; GFX11-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23
; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26
-; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v17, s27
+; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v15, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
; GFX11-NEXT: s_cbranch_scc0 .LBB33_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
@@ -21904,9 +22045,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v17
; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
@@ -21922,8 +22063,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v29
; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB33_3
+; GFX11-NEXT: s_cbranch_execnz .LBB33_3
; GFX11-NEXT: .LBB33_2: ; %cmp.true
; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
@@ -21931,8 +22071,8 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v14, 1.0, v14
; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18
; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20
; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22
@@ -21952,9 +22092,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v17
; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v14
; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v18
@@ -21985,7 +22125,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshl_or_b32 v35, v35, 16, v19
; GFX11-NEXT: v_lshl_or_b32 v12, v12, 16, v18
; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v15
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_lshl_or_b32 v19, v68, 16, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3
@@ -22001,7 +22141,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
; GFX11-NEXT: v_lshl_or_b32 v13, v82, 16, v14
; GFX11-NEXT: v_lshl_or_b32 v14, v81, 16, v21
-; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v17
+; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v17
; GFX11-NEXT: v_lshl_or_b32 v17, v70, 16, v18
; GFX11-NEXT: v_lshl_or_b32 v18, v69, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
@@ -22013,7 +22153,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshl_or_b32 v38, v38, 16, v28
; GFX11-NEXT: v_lshl_or_b32 v32, v32, 16, v22
; GFX11-NEXT: v_lshl_or_b32 v34, v34, 16, v20
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX11-NEXT: v_lshl_or_b32 v20, v67, 16, v0
; GFX11-NEXT: v_lshl_or_b32 v22, v65, 16, v2
; GFX11-NEXT: v_lshl_or_b32 v23, v64, 16, v3
@@ -22029,7 +22169,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: v_lshl_or_b32 v30, v83, 16, v24
; GFX11-NEXT: v_lshl_or_b32 v24, v55, 16, v4
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v11
-; GFX11-NEXT: v_lshl_or_b32 v15, v80, 16, v15
+; GFX11-NEXT: v_lshl_or_b32 v16, v71, 16, v16
; GFX11-NEXT: v_lshl_or_b32 v25, v54, 16, v0
; GFX11-NEXT: v_lshl_or_b32 v27, v52, 16, v2
; GFX11-NEXT: v_lshl_or_b32 v28, v51, 16, v3
@@ -22072,7 +22212,9 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a,
; GFX11-NEXT: ; implicit-def: $vgpr52
; GFX11-NEXT: ; implicit-def: $vgpr51
; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB33_2
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_vccz .LBB33_2
+; GFX11-NEXT: s_branch .LBB33_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -23683,11 +23825,11 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28
@@ -23701,83 +23843,92 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v39, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v2
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v8
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v62, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v9
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v62, v12
; SI-NEXT: v_cvt_f16_f32_e32 v41, v15
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v55, v14
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
; SI-NEXT: v_cvt_f16_f32_e32 v15, v17
; SI-NEXT: v_cvt_f16_f32_e32 v61, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v19
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v18
; SI-NEXT: v_cvt_f16_f32_e32 v17, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v20
; SI-NEXT: v_cvt_f16_f32_e32 v18, v23
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v19, v25
; SI-NEXT: v_cvt_f16_f32_e32 v21, v24
; SI-NEXT: v_cvt_f16_f32_e32 v20, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v26
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v29
-; SI-NEXT: v_cvt_f16_f32_e32 v51, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v12, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v12, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s18
; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v9, s20
; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
-; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
+; SI-NEXT: v_cvt_f16_f32_e32 v8, s24
; SI-NEXT: v_cvt_f16_f32_e32 v5, s27
-; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
+; SI-NEXT: v_cvt_f16_f32_e32 v7, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
-; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v54
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v50
+; SI-NEXT: v_cvt_f16_f32_e32 v50, s28
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v40
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v42
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v45
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -23786,260 +23937,240 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
-; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v59
; SI-NEXT: v_cvt_f16_f32_e32 v59, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB35_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v21, v19
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v22, v18
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
-; SI-NEXT: v_mov_b32_e32 v33, v32
-; SI-NEXT: v_or_b32_e32 v10, v32, v10
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_or_b32_e32 v13, v43, v13
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v8, v5
-; SI-NEXT: v_mov_b32_e32 v57, v39
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49
-; SI-NEXT: v_or_b32_e32 v7, v37, v7
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v6, v50, v6
+; SI-NEXT: v_mov_b32_e32 v30, v50
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
-; SI-NEXT: v_or_b32_e32 v1, v12, v1
-; SI-NEXT: v_or_b32_e32 v2, v11, v2
-; SI-NEXT: v_or_b32_e32 v4, v9, v4
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_mov_b32_e32 v52, v12
+; SI-NEXT: v_or_b32_e32 v0, v12, v0
+; SI-NEXT: v_or_b32_e32 v1, v11, v1
+; SI-NEXT: v_or_b32_e32 v2, v9, v2
+; SI-NEXT: v_or_b32_e32 v3, v10, v3
+; SI-NEXT: v_or_b32_e32 v4, v8, v4
+; SI-NEXT: v_or_b32_e32 v5, v7, v5
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_or_b32_e32 v18, v22, v18
-; SI-NEXT: v_or_b32_e32 v19, v21, v19
-; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48
-; SI-NEXT: v_or_b32_e32 v0, v58, v0
-; SI-NEXT: v_mov_b32_e32 v56, v34
-; SI-NEXT: v_mov_b32_e32 v47, v36
-; SI-NEXT: v_mov_b32_e32 v46, v35
+; SI-NEXT: v_mov_b32_e32 v58, v49
+; SI-NEXT: v_or_b32_e32 v8, v49, v8
+; SI-NEXT: v_mov_b32_e32 v57, v48
+; SI-NEXT: v_mov_b32_e32 v56, v36
+; SI-NEXT: v_or_b32_e32 v9, v36, v9
+; SI-NEXT: v_mov_b32_e32 v47, v38
+; SI-NEXT: v_mov_b32_e32 v46, v37
+; SI-NEXT: v_or_b32_e32 v10, v37, v10
+; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v45, v35
; SI-NEXT: v_or_b32_e32 v11, v35, v11
+; SI-NEXT: v_mov_b32_e32 v44, v34
; SI-NEXT: v_mov_b32_e32 v60, v63
-; SI-NEXT: v_mov_b32_e32 v45, v62
-; SI-NEXT: v_or_b32_e32 v12, v62, v12
+; SI-NEXT: v_or_b32_e32 v12, v63, v12
+; SI-NEXT: v_mov_b32_e32 v43, v62
+; SI-NEXT: v_or_b32_e32 v13, v62, v13
; SI-NEXT: v_mov_b32_e32 v42, v41
; SI-NEXT: v_mov_b32_e32 v40, v55
; SI-NEXT: v_or_b32_e32 v14, v55, v14
; SI-NEXT: v_or_b32_e32 v15, v61, v15
-; SI-NEXT: v_or_b32_e32 v20, v53, v20
-; SI-NEXT: v_or_b32_e32 v21, v51, v21
-; SI-NEXT: v_or_b32_e32 v22, v30, v22
-; SI-NEXT: v_or_b32_e32 v23, v31, v23
+; SI-NEXT: v_or_b32_e32 v16, v53, v16
+; SI-NEXT: v_or_b32_e32 v17, v51, v17
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_or_b32_e32 v20, v21, v20
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v22, v21
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v23, v22
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v24, v23
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_or_b32_e32 v24, v25, v24
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_or_b32_e32 v17, v32, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; SI-NEXT: v_or_b32_e32 v25, v26, v25
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v16, v43, v16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; SI-NEXT: v_or_b32_e32 v26, v27, v26
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v35, v39
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; SI-NEXT: v_or_b32_e32 v27, v28, v27
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37
-; SI-NEXT: v_or_b32_e32 v9, v39, v9
-; SI-NEXT: v_mov_b32_e32 v36, v37
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; SI-NEXT: v_or_b32_e32 v28, v29, v28
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v8, v38, v8
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50
+; SI-NEXT: v_or_b32_e32 v7, v31, v7
+; SI-NEXT: v_mov_b32_e32 v35, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; SI-NEXT: v_or_b32_e32 v29, v54, v29
-; SI-NEXT: v_mov_b32_e32 v54, v32
; SI-NEXT: s_branch .LBB35_3
; SI-NEXT: .LBB35_2:
-; SI-NEXT: v_mov_b32_e32 v54, v53
-; SI-NEXT: v_mov_b32_e32 v53, v52
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v30
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_mov_b32_e32 v48, v31
+; SI-NEXT: v_mov_b32_e32 v52, v12
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT: v_mov_b32_e32 v47, v36
-; SI-NEXT: v_mov_b32_e32 v46, v35
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_mov_b32_e32 v30, v50
-; SI-NEXT: v_mov_b32_e32 v50, v51
-; SI-NEXT: v_mov_b32_e32 v51, v52
-; SI-NEXT: v_mov_b32_e32 v52, v53
-; SI-NEXT: v_mov_b32_e32 v53, v54
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v57, v39
-; SI-NEXT: v_mov_b32_e32 v56, v34
+; SI-NEXT: v_mov_b32_e32 v45, v35
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v58, v49
+; SI-NEXT: v_mov_b32_e32 v57, v48
+; SI-NEXT: v_mov_b32_e32 v56, v36
+; SI-NEXT: v_mov_b32_e32 v47, v38
+; SI-NEXT: v_mov_b32_e32 v46, v37
; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v44, v34
; SI-NEXT: v_mov_b32_e32 v60, v63
-; SI-NEXT: v_mov_b32_e32 v45, v62
+; SI-NEXT: v_mov_b32_e32 v43, v62
; SI-NEXT: v_mov_b32_e32 v42, v41
; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v31, v48
-; SI-NEXT: v_mov_b32_e32 v48, v49
; SI-NEXT: .LBB35_3: ; %Flow
; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: v_mov_b32_e32 v61, v40
-; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB35_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v50
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v31
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v58
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v45
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v46
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v45
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v43
; SI-NEXT: v_mov_b32_e32 v55, v42
; SI-NEXT: v_cvt_f32_f16_e32 v15, v61
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
+; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v51
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v48
-; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v25, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v51
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -24047,42 +24178,48 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
+; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -24095,65 +24232,65 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v35
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v57
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: v_or_b32_e32 v9, v10, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v47
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v47
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v32
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v44
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v13, v12
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
@@ -24165,14 +24302,14 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v14, v15, v14
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_or_b32_e32 v15, v16, v15
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
@@ -24180,9 +24317,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: v_or_b32_e32 v16, v17, v16
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v17, v19, v17
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
@@ -24192,7 +24329,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_or_b32_e32 v18, v20, v18
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
@@ -24204,32 +24341,39 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_or_b32_e32 v19, v20, v19
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v52
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_or_b32_e32 v20, v22, v20
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v50
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_or_b32_e32 v21, v23, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v30
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_or_b32_e32 v22, v23, v22
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_or_b32_e32 v23, v25, v23
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
@@ -24314,6 +24458,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; VI-NEXT: s_lshr_b32 s42, s17, 16
; VI-NEXT: s_lshr_b32 s43, s16, 16
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v32, v15
; VI-NEXT: v_mov_b32_e32 v33, v14
; VI-NEXT: v_mov_b32_e32 v34, v13
@@ -24330,7 +24475,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; VI-NEXT: v_mov_b32_e32 v53, v2
; VI-NEXT: v_mov_b32_e32 v54, v1
; VI-NEXT: v_mov_b32_e32 v55, v0
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: s_cbranch_scc0 .LBB35_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_mov_b32_e32 v0, 16
@@ -24534,11 +24679,28 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB35_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; VI-NEXT: s_branch .LBB35_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB35_2
+; VI-NEXT: s_branch .LBB35_3
;
; GFX9-LABEL: bitcast_v60f16_to_v30f32_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_lshr_b32 s40, s29, 16
+; GFX9-NEXT: s_lshr_b32 s41, s28, 16
+; GFX9-NEXT: s_lshr_b32 s42, s27, 16
+; GFX9-NEXT: s_lshr_b32 s43, s26, 16
+; GFX9-NEXT: s_lshr_b32 s15, s25, 16
+; GFX9-NEXT: s_lshr_b32 s14, s24, 16
+; GFX9-NEXT: s_lshr_b32 s13, s23, 16
+; GFX9-NEXT: s_lshr_b32 s12, s22, 16
+; GFX9-NEXT: s_lshr_b32 s11, s21, 16
+; GFX9-NEXT: s_lshr_b32 s10, s20, 16
+; GFX9-NEXT: s_lshr_b32 s9, s19, 16
+; GFX9-NEXT: s_lshr_b32 s8, s18, 16
+; GFX9-NEXT: s_lshr_b32 s7, s17, 16
+; GFX9-NEXT: s_lshr_b32 s6, s16, 16
; GFX9-NEXT: v_mov_b32_e32 v32, v15
; GFX9-NEXT: v_mov_b32_e32 v33, v14
; GFX9-NEXT: v_mov_b32_e32 v34, v13
@@ -24555,21 +24717,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v53, v2
; GFX9-NEXT: v_mov_b32_e32 v54, v1
; GFX9-NEXT: v_mov_b32_e32 v55, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_lshr_b32 s40, s29, 16
-; GFX9-NEXT: s_lshr_b32 s41, s28, 16
-; GFX9-NEXT: s_lshr_b32 s42, s27, 16
-; GFX9-NEXT: s_lshr_b32 s43, s26, 16
-; GFX9-NEXT: s_lshr_b32 s15, s25, 16
-; GFX9-NEXT: s_lshr_b32 s14, s24, 16
-; GFX9-NEXT: s_lshr_b32 s13, s23, 16
-; GFX9-NEXT: s_lshr_b32 s12, s22, 16
-; GFX9-NEXT: s_lshr_b32 s11, s21, 16
-; GFX9-NEXT: s_lshr_b32 s10, s20, 16
-; GFX9-NEXT: s_lshr_b32 s9, s19, 16
-; GFX9-NEXT: s_lshr_b32 s8, s18, 16
-; GFX9-NEXT: s_lshr_b32 s7, s17, 16
-; GFX9-NEXT: s_lshr_b32 s6, s16, 16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -24590,7 +24738,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33
; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7
; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8
@@ -24605,6 +24752,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42
; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37
; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38
@@ -24752,7 +24900,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: .LBB35_4:
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX9-NEXT: s_branch .LBB35_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB35_2
+; GFX9-NEXT: s_branch .LBB35_3
;
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -24797,41 +24947,41 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1
+; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -24846,17 +24996,16 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3
; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -24870,24 +25019,24 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -24904,7 +25053,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB35_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB35_2
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB35_2
+; GFX11-TRUE16-NEXT: s_branch .LBB35_3
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32_scalar:
; GFX11-FAKE16: ; %bb.0:
@@ -24937,41 +25088,41 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -24986,17 +25137,16 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3
; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -25010,24 +25160,24 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -25044,7 +25194,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a,
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB35_4:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB35_2
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB35_2
+; GFX11-FAKE16-NEXT: s_branch .LBB35_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -25265,6 +25417,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -25282,7 +25435,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a,
; SI-NEXT: v_mov_b32_e32 v15, v1
; SI-NEXT: v_mov_b32_e32 v14, v0
; SI-NEXT: v_mov_b32_e32 v0, s16
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
@@ -25296,10 +25449,13 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a,
; SI-NEXT: v_mov_b32_e32 v11, s27
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB37_4
+; SI-NEXT: s_cbranch_scc0 .LBB37_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB37_3
-; SI-NEXT: .LBB37_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB37_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB37_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
@@ -25330,16 +25486,15 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a,
; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc
; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28
; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
-; SI-NEXT: .LBB37_3: ; %end
+; SI-NEXT: .LBB37_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB37_4:
-; SI-NEXT: s_branch .LBB37_2
;
; VI-LABEL: bitcast_v15i64_to_v15f64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -25357,7 +25512,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a,
; VI-NEXT: v_mov_b32_e32 v15, v1
; VI-NEXT: v_mov_b32_e32 v14, v0
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
@@ -25371,10 +25526,13 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a,
; VI-NEXT: v_mov_b32_e32 v11, s27
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB37_4
+; VI-NEXT: s_cbranch_scc0 .LBB37_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB37_3
-; VI-NEXT: .LBB37_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB37_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB37_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
@@ -25405,16 +25563,15 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a,
; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc
; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28
; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
-; VI-NEXT: .LBB37_3: ; %end
+; VI-NEXT: .LBB37_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB37_4:
-; VI-NEXT: s_branch .LBB37_2
;
; GFX9-LABEL: bitcast_v15i64_to_v15f64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -25432,7 +25589,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v15, v1
; GFX9-NEXT: v_mov_b32_e32 v14, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
@@ -25446,10 +25603,13 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v11, s27
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB37_4
+; GFX9-NEXT: s_cbranch_scc0 .LBB37_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB37_3
-; GFX9-NEXT: .LBB37_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB37_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB37_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2
@@ -25480,43 +25640,41 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a,
; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc
; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28
; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc
-; GFX9-NEXT: .LBB37_3: ; %end
+; GFX9-NEXT: .LBB37_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB37_4:
-; GFX9-NEXT: s_branch .LBB37_2
;
; GFX11-LABEL: bitcast_v15i64_to_v15f64_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB37_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB37_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB37_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB37_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB37_3:
-; GFX11-NEXT: .LBB37_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB37_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -25555,6 +25713,7 @@ define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a,
; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo
+; GFX11-NEXT: .LBB37_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -25708,6 +25867,7 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v29, v15
; SI-NEXT: v_mov_b32_e32 v28, v14
; SI-NEXT: v_mov_b32_e32 v27, v13
@@ -25736,13 +25896,16 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; SI-NEXT: v_mov_b32_e32 v9, s25
; SI-NEXT: v_mov_b32_e32 v10, s26
; SI-NEXT: v_mov_b32_e32 v11, s27
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v12, s28
; SI-NEXT: v_mov_b32_e32 v13, s29
-; SI-NEXT: s_cbranch_scc0 .LBB39_4
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: s_cbranch_scc0 .LBB39_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_cbranch_execnz .LBB39_3
-; SI-NEXT: .LBB39_2: ; %cmp.true
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: .LBB39_2: ; %Flow
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccnz .LBB39_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
@@ -25758,17 +25921,16 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
-; SI-NEXT: .LBB39_3: ; %end
+; SI-NEXT: .LBB39_4: ; %end
; SI-NEXT: v_mov_b32_e32 v16, v30
; SI-NEXT: v_mov_b32_e32 v17, v31
; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB39_4:
-; SI-NEXT: s_branch .LBB39_2
;
; VI-LABEL: bitcast_v15f64_to_v15i64_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v29, v15
; VI-NEXT: v_mov_b32_e32 v28, v14
; VI-NEXT: v_mov_b32_e32 v27, v13
@@ -25797,13 +25959,16 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; VI-NEXT: v_mov_b32_e32 v9, s25
; VI-NEXT: v_mov_b32_e32 v10, s26
; VI-NEXT: v_mov_b32_e32 v11, s27
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v12, s28
; VI-NEXT: v_mov_b32_e32 v13, s29
-; VI-NEXT: s_cbranch_scc0 .LBB39_4
+; VI-NEXT: s_mov_b64 s[4:5], -1
+; VI-NEXT: s_cbranch_scc0 .LBB39_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB39_3
-; VI-NEXT: .LBB39_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB39_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB39_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
@@ -25819,17 +25984,16 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
-; VI-NEXT: .LBB39_3: ; %end
+; VI-NEXT: .LBB39_4: ; %end
; VI-NEXT: v_mov_b32_e32 v16, v30
; VI-NEXT: v_mov_b32_e32 v17, v31
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB39_4:
-; VI-NEXT: s_branch .LBB39_2
;
; GFX9-LABEL: bitcast_v15f64_to_v15i64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v29, v15
; GFX9-NEXT: v_mov_b32_e32 v28, v14
; GFX9-NEXT: v_mov_b32_e32 v27, v13
@@ -25858,13 +26022,16 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v9, s25
; GFX9-NEXT: v_mov_b32_e32 v10, s26
; GFX9-NEXT: v_mov_b32_e32 v11, s27
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v12, s28
; GFX9-NEXT: v_mov_b32_e32 v13, s29
-; GFX9-NEXT: s_cbranch_scc0 .LBB39_4
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
+; GFX9-NEXT: s_cbranch_scc0 .LBB39_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB39_3
-; GFX9-NEXT: .LBB39_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB39_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB39_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
@@ -25880,44 +26047,42 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
-; GFX9-NEXT: .LBB39_3: ; %end
+; GFX9-NEXT: .LBB39_4: ; %end
; GFX9-NEXT: v_mov_b32_e32 v16, v30
; GFX9-NEXT: v_mov_b32_e32 v17, v31
; GFX9-NEXT: s_setpc_b64 s[30:31]
-; GFX9-NEXT: .LBB39_4:
-; GFX9-NEXT: s_branch .LBB39_2
;
; GFX11-LABEL: bitcast_v15f64_to_v15i64_scalar:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11
-; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9
-; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7
+; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v28, v10
+; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v26, v8
+; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v24, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5
-; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3
-; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1
-; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
-; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19
-; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21
-; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
-; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25
-; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
-; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: v_mov_b32_e32 v16, s28
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v22, v4
+; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v20, v2
+; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v18, v0
+; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s16
+; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s18
+; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s20
+; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX11-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s24
+; GFX11-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s26
+; GFX11-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s28
+; GFX11-NEXT: v_mov_b32_e32 v17, s29
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: s_cbranch_scc0 .LBB39_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT: s_cbranch_scc0 .LBB39_3
-; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: .LBB39_2: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccz .LBB39_4
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-; GFX11-NEXT: .LBB39_3:
-; GFX11-NEXT: .LBB39_4: ; %cmp.true
+; GFX11-NEXT: s_cbranch_vccnz .LBB39_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
@@ -25933,6 +26098,7 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a,
; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-NEXT: .LBB39_4: ; %end
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -26970,6 +27136,7 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s45, v1
; SI-NEXT: v_readfirstlane_b32 s44, v2
; SI-NEXT: v_readfirstlane_b32 s43, v3
@@ -26985,8 +27152,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s9, v13
; SI-NEXT: v_readfirstlane_b32 s8, v14
; SI-NEXT: v_readfirstlane_b32 s7, v15
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s6, v16
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: s_cbranch_scc0 .LBB41_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_mov_b32_e32 v1, s7
@@ -27324,7 +27491,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr47
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: ; implicit-def: $sgpr46
-; SI-NEXT: s_branch .LBB41_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz .LBB41_2
+; SI-NEXT: s_branch .LBB41_3
;
; VI-LABEL: bitcast_v15i64_to_v60i16_scalar:
; VI: ; %bb.0:
@@ -27337,8 +27506,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: v_writelane_b32 v30, s34, 2
; VI-NEXT: v_writelane_b32 v30, s35, 3
; VI-NEXT: v_writelane_b32 v30, s36, 4
-; VI-NEXT: v_writelane_b32 v30, s37, 5
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: v_writelane_b32 v30, s37, 5
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_writelane_b32 v30, s38, 6
; VI-NEXT: v_readfirstlane_b32 s45, v0
; VI-NEXT: v_readfirstlane_b32 s44, v1
@@ -27354,14 +27524,14 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s10, v11
; VI-NEXT: v_readfirstlane_b32 s9, v12
; VI-NEXT: v_readfirstlane_b32 s8, v13
-; VI-NEXT: v_readfirstlane_b32 s6, v14
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s7, v15
+; VI-NEXT: v_readfirstlane_b32 s7, v14
+; VI-NEXT: v_readfirstlane_b32 s6, v15
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_writelane_b32 v30, s39, 7
; VI-NEXT: s_cbranch_scc0 .LBB41_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s46, s7, 16
-; VI-NEXT: s_lshr_b32 s47, s6, 16
+; VI-NEXT: s_lshr_b32 s46, s6, 16
+; VI-NEXT: s_lshr_b32 s47, s7, 16
; VI-NEXT: s_lshr_b32 s56, s8, 16
; VI-NEXT: s_lshr_b32 s57, s9, 16
; VI-NEXT: s_lshr_b32 s58, s10, 16
@@ -27392,8 +27562,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: s_lshr_b32 s39, s16, 16
; VI-NEXT: s_cbranch_execnz .LBB41_3
; VI-NEXT: .LBB41_2: ; %cmp.true
-; VI-NEXT: s_add_u32 s6, s6, 3
-; VI-NEXT: s_addc_u32 s7, s7, 0
+; VI-NEXT: s_add_u32 s7, s7, 3
+; VI-NEXT: s_addc_u32 s6, s6, 0
; VI-NEXT: s_add_u32 s9, s9, 3
; VI-NEXT: s_addc_u32 s8, s8, 0
; VI-NEXT: s_add_u32 s11, s11, 3
@@ -27422,8 +27592,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: s_addc_u32 s19, s19, 0
; VI-NEXT: s_add_u32 s16, s16, 3
; VI-NEXT: s_addc_u32 s17, s17, 0
-; VI-NEXT: s_lshr_b32 s46, s7, 16
-; VI-NEXT: s_lshr_b32 s47, s6, 16
+; VI-NEXT: s_lshr_b32 s46, s6, 16
+; VI-NEXT: s_lshr_b32 s47, s7, 16
; VI-NEXT: s_lshr_b32 s56, s8, 16
; VI-NEXT: s_lshr_b32 s57, s9, 16
; VI-NEXT: s_lshr_b32 s58, s10, 16
@@ -27537,12 +27707,12 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: s_and_b32 s8, 0xffff, s8
; VI-NEXT: s_lshl_b32 s44, s56, 16
; VI-NEXT: s_or_b32 s8, s8, s44
-; VI-NEXT: s_and_b32 s6, 0xffff, s6
-; VI-NEXT: s_lshl_b32 s44, s47, 16
-; VI-NEXT: s_or_b32 s6, s6, s44
; VI-NEXT: s_and_b32 s7, 0xffff, s7
-; VI-NEXT: s_lshl_b32 s44, s46, 16
+; VI-NEXT: s_lshl_b32 s44, s47, 16
; VI-NEXT: s_or_b32 s7, s7, s44
+; VI-NEXT: s_and_b32 s6, 0xffff, s6
+; VI-NEXT: s_lshl_b32 s44, s46, 16
+; VI-NEXT: s_or_b32 s6, s6, s44
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s16
@@ -27571,8 +27741,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v25, s10
; VI-NEXT: v_mov_b32_e32 v26, s9
; VI-NEXT: v_mov_b32_e32 v27, s8
-; VI-NEXT: v_mov_b32_e32 v28, s6
-; VI-NEXT: v_mov_b32_e32 v29, s7
+; VI-NEXT: v_mov_b32_e32 v28, s7
+; VI-NEXT: v_mov_b32_e32 v29, s6
; VI-NEXT: v_readlane_b32 s39, v30, 7
; VI-NEXT: v_readlane_b32 s38, v30, 6
; VI-NEXT: v_readlane_b32 s37, v30, 5
@@ -27617,7 +27787,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; VI-NEXT: ; implicit-def: $sgpr56
; VI-NEXT: ; implicit-def: $sgpr47
; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: s_branch .LBB41_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB41_2
+; VI-NEXT: s_branch .LBB41_3
;
; GFX9-LABEL: bitcast_v15i64_to_v60i16_scalar:
; GFX9: ; %bb.0:
@@ -27626,45 +27798,46 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v30, s30, 0
-; GFX9-NEXT: v_writelane_b32 v30, s31, 1
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_writelane_b32 v30, s34, 2
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s7, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v2
-; GFX9-NEXT: v_readfirstlane_b32 s9, v3
-; GFX9-NEXT: v_readfirstlane_b32 s10, v4
-; GFX9-NEXT: v_readfirstlane_b32 s11, v5
-; GFX9-NEXT: v_readfirstlane_b32 s12, v6
-; GFX9-NEXT: v_readfirstlane_b32 s13, v7
-; GFX9-NEXT: v_readfirstlane_b32 s14, v8
-; GFX9-NEXT: v_readfirstlane_b32 s15, v9
-; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_readfirstlane_b32 s41, v11
-; GFX9-NEXT: v_readfirstlane_b32 s42, v12
-; GFX9-NEXT: v_readfirstlane_b32 s43, v13
-; GFX9-NEXT: v_readfirstlane_b32 s44, v14
+; GFX9-NEXT: v_writelane_b32 v30, s31, 1
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: v_readfirstlane_b32 s45, v15
+; GFX9-NEXT: v_writelane_b32 v30, s34, 2
+; GFX9-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s9, v2
+; GFX9-NEXT: v_readfirstlane_b32 s10, v3
+; GFX9-NEXT: v_readfirstlane_b32 s11, v4
+; GFX9-NEXT: v_readfirstlane_b32 s12, v5
+; GFX9-NEXT: v_readfirstlane_b32 s13, v6
+; GFX9-NEXT: v_readfirstlane_b32 s14, v7
+; GFX9-NEXT: v_readfirstlane_b32 s15, v8
+; GFX9-NEXT: v_readfirstlane_b32 s40, v9
+; GFX9-NEXT: v_readfirstlane_b32 s41, v10
+; GFX9-NEXT: v_readfirstlane_b32 s42, v11
+; GFX9-NEXT: v_readfirstlane_b32 s43, v12
+; GFX9-NEXT: v_readfirstlane_b32 s44, v13
+; GFX9-NEXT: v_readfirstlane_b32 s45, v14
+; GFX9-NEXT: v_readfirstlane_b32 s6, v15
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_writelane_b32 v30, s35, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB41_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_lshr_b32 s46, s45, 16
-; GFX9-NEXT: s_lshr_b32 s47, s44, 16
-; GFX9-NEXT: s_lshr_b32 s56, s43, 16
-; GFX9-NEXT: s_lshr_b32 s57, s42, 16
-; GFX9-NEXT: s_lshr_b32 s58, s41, 16
-; GFX9-NEXT: s_lshr_b32 s59, s40, 16
-; GFX9-NEXT: s_lshr_b32 s60, s15, 16
-; GFX9-NEXT: s_lshr_b32 s61, s14, 16
-; GFX9-NEXT: s_lshr_b32 s62, s13, 16
-; GFX9-NEXT: s_lshr_b32 s63, s12, 16
-; GFX9-NEXT: s_lshr_b32 s72, s11, 16
-; GFX9-NEXT: s_lshr_b32 s73, s10, 16
-; GFX9-NEXT: s_lshr_b32 s74, s9, 16
-; GFX9-NEXT: s_lshr_b32 s75, s8, 16
-; GFX9-NEXT: s_lshr_b32 s76, s7, 16
-; GFX9-NEXT: s_lshr_b32 s77, s6, 16
+; GFX9-NEXT: s_lshr_b32 s46, s6, 16
+; GFX9-NEXT: s_lshr_b32 s47, s45, 16
+; GFX9-NEXT: s_lshr_b32 s56, s44, 16
+; GFX9-NEXT: s_lshr_b32 s57, s43, 16
+; GFX9-NEXT: s_lshr_b32 s58, s42, 16
+; GFX9-NEXT: s_lshr_b32 s59, s41, 16
+; GFX9-NEXT: s_lshr_b32 s60, s40, 16
+; GFX9-NEXT: s_lshr_b32 s61, s15, 16
+; GFX9-NEXT: s_lshr_b32 s62, s14, 16
+; GFX9-NEXT: s_lshr_b32 s63, s13, 16
+; GFX9-NEXT: s_lshr_b32 s72, s12, 16
+; GFX9-NEXT: s_lshr_b32 s73, s11, 16
+; GFX9-NEXT: s_lshr_b32 s74, s10, 16
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s8, 16
+; GFX9-NEXT: s_lshr_b32 s77, s7, 16
; GFX9-NEXT: s_lshr_b32 s78, s29, 16
; GFX9-NEXT: s_lshr_b32 s79, s28, 16
; GFX9-NEXT: s_lshr_b32 s88, s27, 16
@@ -27681,22 +27854,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: s_lshr_b32 s35, s16, 16
; GFX9-NEXT: s_cbranch_execnz .LBB41_3
; GFX9-NEXT: .LBB41_2: ; %cmp.true
-; GFX9-NEXT: s_add_u32 s44, s44, 3
-; GFX9-NEXT: s_addc_u32 s45, s45, 0
-; GFX9-NEXT: s_add_u32 s42, s42, 3
-; GFX9-NEXT: s_addc_u32 s43, s43, 0
-; GFX9-NEXT: s_add_u32 s40, s40, 3
-; GFX9-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-NEXT: s_add_u32 s14, s14, 3
-; GFX9-NEXT: s_addc_u32 s15, s15, 0
-; GFX9-NEXT: s_add_u32 s12, s12, 3
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NEXT: s_add_u32 s10, s10, 3
-; GFX9-NEXT: s_addc_u32 s11, s11, 0
-; GFX9-NEXT: s_add_u32 s8, s8, 3
-; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_add_u32 s6, s6, 3
-; GFX9-NEXT: s_addc_u32 s7, s7, 0
+; GFX9-NEXT: s_add_u32 s45, s45, 3
+; GFX9-NEXT: s_addc_u32 s6, s6, 0
+; GFX9-NEXT: s_add_u32 s43, s43, 3
+; GFX9-NEXT: s_addc_u32 s44, s44, 0
+; GFX9-NEXT: s_add_u32 s41, s41, 3
+; GFX9-NEXT: s_addc_u32 s42, s42, 0
+; GFX9-NEXT: s_add_u32 s15, s15, 3
+; GFX9-NEXT: s_addc_u32 s40, s40, 0
+; GFX9-NEXT: s_add_u32 s13, s13, 3
+; GFX9-NEXT: s_addc_u32 s14, s14, 0
+; GFX9-NEXT: s_add_u32 s11, s11, 3
+; GFX9-NEXT: s_addc_u32 s12, s12, 0
+; GFX9-NEXT: s_add_u32 s9, s9, 3
+; GFX9-NEXT: s_addc_u32 s10, s10, 0
+; GFX9-NEXT: s_add_u32 s7, s7, 3
+; GFX9-NEXT: s_addc_u32 s8, s8, 0
; GFX9-NEXT: s_add_u32 s28, s28, 3
; GFX9-NEXT: s_addc_u32 s29, s29, 0
; GFX9-NEXT: s_add_u32 s26, s26, 3
@@ -27711,22 +27884,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: s_addc_u32 s19, s19, 0
; GFX9-NEXT: s_add_u32 s16, s16, 3
; GFX9-NEXT: s_addc_u32 s17, s17, 0
-; GFX9-NEXT: s_lshr_b32 s46, s45, 16
-; GFX9-NEXT: s_lshr_b32 s47, s44, 16
-; GFX9-NEXT: s_lshr_b32 s56, s43, 16
-; GFX9-NEXT: s_lshr_b32 s57, s42, 16
-; GFX9-NEXT: s_lshr_b32 s58, s41, 16
-; GFX9-NEXT: s_lshr_b32 s59, s40, 16
-; GFX9-NEXT: s_lshr_b32 s60, s15, 16
-; GFX9-NEXT: s_lshr_b32 s61, s14, 16
-; GFX9-NEXT: s_lshr_b32 s62, s13, 16
-; GFX9-NEXT: s_lshr_b32 s63, s12, 16
-; GFX9-NEXT: s_lshr_b32 s72, s11, 16
-; GFX9-NEXT: s_lshr_b32 s73, s10, 16
-; GFX9-NEXT: s_lshr_b32 s74, s9, 16
-; GFX9-NEXT: s_lshr_b32 s75, s8, 16
-; GFX9-NEXT: s_lshr_b32 s76, s7, 16
-; GFX9-NEXT: s_lshr_b32 s77, s6, 16
+; GFX9-NEXT: s_lshr_b32 s46, s6, 16
+; GFX9-NEXT: s_lshr_b32 s47, s45, 16
+; GFX9-NEXT: s_lshr_b32 s56, s44, 16
+; GFX9-NEXT: s_lshr_b32 s57, s43, 16
+; GFX9-NEXT: s_lshr_b32 s58, s42, 16
+; GFX9-NEXT: s_lshr_b32 s59, s41, 16
+; GFX9-NEXT: s_lshr_b32 s60, s40, 16
+; GFX9-NEXT: s_lshr_b32 s61, s15, 16
+; GFX9-NEXT: s_lshr_b32 s62, s14, 16
+; GFX9-NEXT: s_lshr_b32 s63, s13, 16
+; GFX9-NEXT: s_lshr_b32 s72, s12, 16
+; GFX9-NEXT: s_lshr_b32 s73, s11, 16
+; GFX9-NEXT: s_lshr_b32 s74, s10, 16
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s8, 16
+; GFX9-NEXT: s_lshr_b32 s77, s7, 16
; GFX9-NEXT: s_lshr_b32 s78, s29, 16
; GFX9-NEXT: s_lshr_b32 s79, s28, 16
; GFX9-NEXT: s_lshr_b32 s88, s27, 16
@@ -27756,22 +27929,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88
; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79
; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77
-; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76
-; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72
-; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63
-; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62
-; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61
-; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60
-; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59
-; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58
-; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57
-; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56
-; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47
-; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46
+; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s77
+; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s76
+; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s75
+; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s74
+; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s73
+; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s72
+; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s63
+; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s62
+; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s61
+; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s60
+; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s59
+; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s58
+; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s57
+; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s56
+; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s47
+; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s46
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s16
@@ -27786,22 +27959,22 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v11, s25
; GFX9-NEXT: v_mov_b32_e32 v12, s26
; GFX9-NEXT: v_mov_b32_e32 v13, s27
-; GFX9-NEXT: v_mov_b32_e32 v14, s6
-; GFX9-NEXT: v_mov_b32_e32 v15, s7
-; GFX9-NEXT: v_mov_b32_e32 v16, s8
-; GFX9-NEXT: v_mov_b32_e32 v17, s9
-; GFX9-NEXT: v_mov_b32_e32 v18, s10
-; GFX9-NEXT: v_mov_b32_e32 v19, s11
-; GFX9-NEXT: v_mov_b32_e32 v20, s12
-; GFX9-NEXT: v_mov_b32_e32 v21, s13
-; GFX9-NEXT: v_mov_b32_e32 v22, s14
-; GFX9-NEXT: v_mov_b32_e32 v23, s15
-; GFX9-NEXT: v_mov_b32_e32 v24, s28
-; GFX9-NEXT: v_mov_b32_e32 v25, s29
-; GFX9-NEXT: v_mov_b32_e32 v26, s40
-; GFX9-NEXT: v_mov_b32_e32 v27, s41
-; GFX9-NEXT: v_mov_b32_e32 v28, s42
-; GFX9-NEXT: v_mov_b32_e32 v29, s43
+; GFX9-NEXT: v_mov_b32_e32 v14, s7
+; GFX9-NEXT: v_mov_b32_e32 v15, s8
+; GFX9-NEXT: v_mov_b32_e32 v16, s9
+; GFX9-NEXT: v_mov_b32_e32 v17, s10
+; GFX9-NEXT: v_mov_b32_e32 v18, s11
+; GFX9-NEXT: v_mov_b32_e32 v19, s12
+; GFX9-NEXT: v_mov_b32_e32 v20, s13
+; GFX9-NEXT: v_mov_b32_e32 v21, s14
+; GFX9-NEXT: v_mov_b32_e32 v22, s15
+; GFX9-NEXT: v_mov_b32_e32 v23, s28
+; GFX9-NEXT: v_mov_b32_e32 v24, s29
+; GFX9-NEXT: v_mov_b32_e32 v25, s40
+; GFX9-NEXT: v_mov_b32_e32 v26, s41
+; GFX9-NEXT: v_mov_b32_e32 v27, s42
+; GFX9-NEXT: v_mov_b32_e32 v28, s43
+; GFX9-NEXT: v_mov_b32_e32 v29, s6
; GFX9-NEXT: v_readlane_b32 s35, v30, 3
; GFX9-NEXT: v_readlane_b32 s34, v30, 2
; GFX9-NEXT: v_readlane_b32 s31, v30, 1
@@ -27842,7 +28015,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX9-NEXT: ; implicit-def: $sgpr56
; GFX9-NEXT: ; implicit-def: $sgpr47
; GFX9-NEXT: ; implicit-def: $sgpr46
-; GFX9-NEXT: s_branch .LBB41_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB41_2
+; GFX9-NEXT: s_branch .LBB41_3
;
; GFX11-LABEL: bitcast_v15i64_to_v60i16_scalar:
; GFX11: ; %bb.0:
@@ -27857,16 +28032,16 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX11-NEXT: v_readfirstlane_b32 s10, v6
; GFX11-NEXT: v_readfirstlane_b32 s11, v7
; GFX11-NEXT: v_readfirstlane_b32 s12, v8
-; GFX11-NEXT: v_readfirstlane_b32 s13, v9
+; GFX11-NEXT: v_readfirstlane_b32 s14, v9
; GFX11-NEXT: v_readfirstlane_b32 s15, v10
-; GFX11-NEXT: v_readfirstlane_b32 s14, v11
-; GFX11-NEXT: s_mov_b32 s94, 0
+; GFX11-NEXT: v_readfirstlane_b32 s13, v11
+; GFX11-NEXT: s_mov_b32 s94, -1
; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo
; GFX11-NEXT: s_cbranch_scc0 .LBB41_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s40, s14, 16
+; GFX11-NEXT: s_lshr_b32 s40, s13, 16
; GFX11-NEXT: s_lshr_b32 s41, s15, 16
-; GFX11-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-NEXT: s_lshr_b32 s42, s14, 16
; GFX11-NEXT: s_lshr_b32 s43, s12, 16
; GFX11-NEXT: s_lshr_b32 s44, s11, 16
; GFX11-NEXT: s_lshr_b32 s45, s10, 16
@@ -27894,13 +28069,12 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX11-NEXT: s_lshr_b32 s91, s2, 16
; GFX11-NEXT: s_lshr_b32 s92, s1, 16
; GFX11-NEXT: s_lshr_b32 s93, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
-; GFX11-NEXT: s_cbranch_vccnz .LBB41_3
+; GFX11-NEXT: s_cbranch_execnz .LBB41_3
; GFX11-NEXT: .LBB41_2: ; %cmp.true
; GFX11-NEXT: s_add_u32 s15, s15, 3
-; GFX11-NEXT: s_addc_u32 s14, s14, 0
-; GFX11-NEXT: s_add_u32 s12, s12, 3
; GFX11-NEXT: s_addc_u32 s13, s13, 0
+; GFX11-NEXT: s_add_u32 s12, s12, 3
+; GFX11-NEXT: s_addc_u32 s14, s14, 0
; GFX11-NEXT: s_add_u32 s10, s10, 3
; GFX11-NEXT: s_addc_u32 s11, s11, 0
; GFX11-NEXT: s_add_u32 s8, s8, 3
@@ -27927,9 +28101,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX11-NEXT: s_addc_u32 s3, s3, 0
; GFX11-NEXT: s_add_u32 s0, s0, 3
; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_lshr_b32 s40, s14, 16
+; GFX11-NEXT: s_lshr_b32 s40, s13, 16
; GFX11-NEXT: s_lshr_b32 s41, s15, 16
-; GFX11-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-NEXT: s_lshr_b32 s42, s14, 16
; GFX11-NEXT: s_lshr_b32 s43, s12, 16
; GFX11-NEXT: s_lshr_b32 s44, s11, 16
; GFX11-NEXT: s_lshr_b32 s45, s10, 16
@@ -27986,9 +28160,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45
; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44
; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43
-; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42
+; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s42
; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41
-; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40
+; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
@@ -28002,8 +28176,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7
; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9
; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11
-; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13
-; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
+; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s14
+; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s13
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT: .LBB41_4:
; GFX11-NEXT: ; implicit-def: $sgpr93
@@ -28036,7 +28210,9 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr41
; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: s_branch .LBB41_2
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
+; GFX11-NEXT: s_cbranch_vccz .LBB41_2
+; GFX11-NEXT: s_branch .LBB41_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -29454,6 +29630,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v35, v22
; SI-NEXT: v_mov_b32_e32 v36, v20
; SI-NEXT: v_mov_b32_e32 v37, v18
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5
@@ -29485,7 +29662,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_and_b64 s[6:7], vcc, exec
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10
@@ -29788,7 +29965,9 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v30, v32
-; SI-NEXT: s_branch .LBB43_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz .LBB43_2
+; SI-NEXT: s_branch .LBB43_3
;
; VI-LABEL: bitcast_v60i16_to_v15i64_scalar:
; VI: ; %bb.0:
@@ -29808,6 +29987,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; VI-NEXT: s_lshr_b32 s42, s17, 16
; VI-NEXT: s_lshr_b32 s43, s16, 16
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v32, v15
; VI-NEXT: v_mov_b32_e32 v33, v14
; VI-NEXT: v_mov_b32_e32 v34, v13
@@ -29824,7 +30004,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; VI-NEXT: v_mov_b32_e32 v53, v2
; VI-NEXT: v_mov_b32_e32 v54, v1
; VI-NEXT: v_mov_b32_e32 v55, v0
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: s_cbranch_scc0 .LBB43_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_mov_b32_e32 v0, 16
@@ -30071,11 +30251,28 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB43_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; VI-NEXT: s_branch .LBB43_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB43_2
+; VI-NEXT: s_branch .LBB43_3
;
; GFX9-LABEL: bitcast_v60i16_to_v15i64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_lshr_b32 s40, s29, 16
+; GFX9-NEXT: s_lshr_b32 s41, s28, 16
+; GFX9-NEXT: s_lshr_b32 s42, s27, 16
+; GFX9-NEXT: s_lshr_b32 s43, s26, 16
+; GFX9-NEXT: s_lshr_b32 s15, s25, 16
+; GFX9-NEXT: s_lshr_b32 s14, s24, 16
+; GFX9-NEXT: s_lshr_b32 s13, s23, 16
+; GFX9-NEXT: s_lshr_b32 s12, s22, 16
+; GFX9-NEXT: s_lshr_b32 s11, s21, 16
+; GFX9-NEXT: s_lshr_b32 s10, s20, 16
+; GFX9-NEXT: s_lshr_b32 s9, s19, 16
+; GFX9-NEXT: s_lshr_b32 s8, s18, 16
+; GFX9-NEXT: s_lshr_b32 s7, s17, 16
+; GFX9-NEXT: s_lshr_b32 s6, s16, 16
; GFX9-NEXT: v_mov_b32_e32 v32, v15
; GFX9-NEXT: v_mov_b32_e32 v33, v14
; GFX9-NEXT: v_mov_b32_e32 v34, v13
@@ -30092,21 +30289,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: v_mov_b32_e32 v53, v2
; GFX9-NEXT: v_mov_b32_e32 v54, v1
; GFX9-NEXT: v_mov_b32_e32 v55, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_lshr_b32 s40, s29, 16
-; GFX9-NEXT: s_lshr_b32 s41, s28, 16
-; GFX9-NEXT: s_lshr_b32 s42, s27, 16
-; GFX9-NEXT: s_lshr_b32 s43, s26, 16
-; GFX9-NEXT: s_lshr_b32 s15, s25, 16
-; GFX9-NEXT: s_lshr_b32 s14, s24, 16
-; GFX9-NEXT: s_lshr_b32 s13, s23, 16
-; GFX9-NEXT: s_lshr_b32 s12, s22, 16
-; GFX9-NEXT: s_lshr_b32 s11, s21, 16
-; GFX9-NEXT: s_lshr_b32 s10, s20, 16
-; GFX9-NEXT: s_lshr_b32 s9, s19, 16
-; GFX9-NEXT: s_lshr_b32 s8, s18, 16
-; GFX9-NEXT: s_lshr_b32 s7, s17, 16
-; GFX9-NEXT: s_lshr_b32 s6, s16, 16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -30127,7 +30310,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33
; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7
; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8
@@ -30142,6 +30324,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42
; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37
; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38
@@ -30287,7 +30470,9 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: .LBB43_4:
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX9-NEXT: s_branch .LBB43_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB43_2
+; GFX9-NEXT: s_branch .LBB43_3
;
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -30332,41 +30517,41 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1
+; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -30381,17 +30566,16 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_3
; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -30405,24 +30589,24 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -30439,7 +30623,9 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB43_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB43_2
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB43_2
+; GFX11-TRUE16-NEXT: s_branch .LBB43_3
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15i64_scalar:
; GFX11-FAKE16: ; %bb.0:
@@ -30472,41 +30658,41 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -30521,17 +30707,16 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_3
; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -30545,24 +30730,24 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -30579,7 +30764,9 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB43_4:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB43_2
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB43_2
+; GFX11-FAKE16-NEXT: s_branch .LBB43_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -31974,6 +32161,7 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s44, v1
; SI-NEXT: v_readfirstlane_b32 s45, v2
; SI-NEXT: v_readfirstlane_b32 s42, v3
@@ -31986,11 +32174,11 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: v_readfirstlane_b32 s13, v10
; SI-NEXT: v_readfirstlane_b32 s10, v11
; SI-NEXT: v_readfirstlane_b32 s11, v12
-; SI-NEXT: v_readfirstlane_b32 s7, v13
-; SI-NEXT: v_readfirstlane_b32 s8, v14
+; SI-NEXT: v_readfirstlane_b32 s8, v13
+; SI-NEXT: v_readfirstlane_b32 s9, v14
; SI-NEXT: v_readfirstlane_b32 s6, v15
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
-; SI-NEXT: v_readfirstlane_b32 s9, v16
+; SI-NEXT: v_readfirstlane_b32 s7, v16
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
@@ -32006,13 +32194,13 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB45_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: s_lshr_b32 s4, s9, 16
+; SI-NEXT: s_lshr_b32 s4, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
; SI-NEXT: s_lshr_b32 s4, s6, 16
; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
-; SI-NEXT: s_lshr_b32 s4, s8, 16
+; SI-NEXT: s_lshr_b32 s4, s9, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT: s_lshr_b32 s4, s7, 16
+; SI-NEXT: s_lshr_b32 s4, s8, 16
; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
; SI-NEXT: s_lshr_b32 s4, s11, 16
; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
@@ -32070,10 +32258,10 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: s_lshr_b32 s4, s16, 16
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v60, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s9
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s7
; SI-NEXT: v_cvt_f32_f16_e32 v6, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v8, s8
-; SI-NEXT: v_cvt_f32_f16_e32 v10, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v8, s9
+; SI-NEXT: v_cvt_f32_f16_e32 v10, s8
; SI-NEXT: v_cvt_f32_f16_e32 v12, s11
; SI-NEXT: v_cvt_f32_f16_e32 v14, s10
; SI-NEXT: v_cvt_f32_f16_e32 v16, s13
@@ -32154,18 +32342,18 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: s_addc_u32 s11, s11, 0
; SI-NEXT: s_lshr_b32 s92, s10, 16
; SI-NEXT: s_lshr_b32 s93, s11, 16
-; SI-NEXT: s_add_u32 s7, s7, 3
-; SI-NEXT: s_addc_u32 s8, s8, 0
-; SI-NEXT: s_lshr_b32 s94, s7, 16
-; SI-NEXT: s_lshr_b32 s95, s8, 16
-; SI-NEXT: s_add_u32 s6, s6, 3
+; SI-NEXT: s_add_u32 s8, s8, 3
; SI-NEXT: s_addc_u32 s9, s9, 0
+; SI-NEXT: s_lshr_b32 s94, s8, 16
+; SI-NEXT: s_lshr_b32 s95, s9, 16
+; SI-NEXT: s_add_u32 s6, s6, 3
+; SI-NEXT: s_addc_u32 s7, s7, 0
; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16
-; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s9
+; SI-NEXT: s_lshr_b32 vcc_hi, s7, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s7
; SI-NEXT: v_cvt_f32_f16_e32 v6, s6
-; SI-NEXT: v_cvt_f32_f16_e32 v8, s8
-; SI-NEXT: v_cvt_f32_f16_e32 v10, s7
+; SI-NEXT: v_cvt_f32_f16_e32 v8, s9
+; SI-NEXT: v_cvt_f32_f16_e32 v10, s8
; SI-NEXT: v_cvt_f32_f16_e32 v12, s11
; SI-NEXT: v_cvt_f32_f16_e32 v14, s10
; SI-NEXT: v_cvt_f32_f16_e32 v16, s13
@@ -32512,7 +32700,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: s_branch .LBB45_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz .LBB45_2
+; SI-NEXT: s_branch .LBB45_3
;
; VI-LABEL: bitcast_v15i64_to_v60f16_scalar:
; VI: ; %bb.0:
@@ -32525,8 +32715,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: v_writelane_b32 v30, s34, 2
; VI-NEXT: v_writelane_b32 v30, s35, 3
; VI-NEXT: v_writelane_b32 v30, s36, 4
-; VI-NEXT: v_writelane_b32 v30, s37, 5
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: v_writelane_b32 v30, s37, 5
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_writelane_b32 v30, s38, 6
; VI-NEXT: v_readfirstlane_b32 s45, v0
; VI-NEXT: v_readfirstlane_b32 s44, v1
@@ -32542,14 +32733,14 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: v_readfirstlane_b32 s10, v11
; VI-NEXT: v_readfirstlane_b32 s9, v12
; VI-NEXT: v_readfirstlane_b32 s8, v13
-; VI-NEXT: v_readfirstlane_b32 s6, v14
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
-; VI-NEXT: v_readfirstlane_b32 s7, v15
+; VI-NEXT: v_readfirstlane_b32 s7, v14
+; VI-NEXT: v_readfirstlane_b32 s6, v15
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_writelane_b32 v30, s39, 7
; VI-NEXT: s_cbranch_scc0 .LBB45_4
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_lshr_b32 s46, s7, 16
-; VI-NEXT: s_lshr_b32 s47, s6, 16
+; VI-NEXT: s_lshr_b32 s46, s6, 16
+; VI-NEXT: s_lshr_b32 s47, s7, 16
; VI-NEXT: s_lshr_b32 s56, s8, 16
; VI-NEXT: s_lshr_b32 s57, s9, 16
; VI-NEXT: s_lshr_b32 s58, s10, 16
@@ -32580,8 +32771,8 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: s_lshr_b32 s39, s16, 16
; VI-NEXT: s_cbranch_execnz .LBB45_3
; VI-NEXT: .LBB45_2: ; %cmp.true
-; VI-NEXT: s_add_u32 s6, s6, 3
-; VI-NEXT: s_addc_u32 s7, s7, 0
+; VI-NEXT: s_add_u32 s7, s7, 3
+; VI-NEXT: s_addc_u32 s6, s6, 0
; VI-NEXT: s_add_u32 s9, s9, 3
; VI-NEXT: s_addc_u32 s8, s8, 0
; VI-NEXT: s_add_u32 s11, s11, 3
@@ -32610,8 +32801,8 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: s_addc_u32 s19, s19, 0
; VI-NEXT: s_add_u32 s16, s16, 3
; VI-NEXT: s_addc_u32 s17, s17, 0
-; VI-NEXT: s_lshr_b32 s46, s7, 16
-; VI-NEXT: s_lshr_b32 s47, s6, 16
+; VI-NEXT: s_lshr_b32 s46, s6, 16
+; VI-NEXT: s_lshr_b32 s47, s7, 16
; VI-NEXT: s_lshr_b32 s56, s8, 16
; VI-NEXT: s_lshr_b32 s57, s9, 16
; VI-NEXT: s_lshr_b32 s58, s10, 16
@@ -32725,12 +32916,12 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: s_and_b32 s8, 0xffff, s8
; VI-NEXT: s_lshl_b32 s44, s56, 16
; VI-NEXT: s_or_b32 s8, s8, s44
-; VI-NEXT: s_and_b32 s6, 0xffff, s6
-; VI-NEXT: s_lshl_b32 s44, s47, 16
-; VI-NEXT: s_or_b32 s6, s6, s44
; VI-NEXT: s_and_b32 s7, 0xffff, s7
-; VI-NEXT: s_lshl_b32 s44, s46, 16
+; VI-NEXT: s_lshl_b32 s44, s47, 16
; VI-NEXT: s_or_b32 s7, s7, s44
+; VI-NEXT: s_and_b32 s6, 0xffff, s6
+; VI-NEXT: s_lshl_b32 s44, s46, 16
+; VI-NEXT: s_or_b32 s6, s6, s44
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s16
@@ -32759,8 +32950,8 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v25, s10
; VI-NEXT: v_mov_b32_e32 v26, s9
; VI-NEXT: v_mov_b32_e32 v27, s8
-; VI-NEXT: v_mov_b32_e32 v28, s6
-; VI-NEXT: v_mov_b32_e32 v29, s7
+; VI-NEXT: v_mov_b32_e32 v28, s7
+; VI-NEXT: v_mov_b32_e32 v29, s6
; VI-NEXT: v_readlane_b32 s39, v30, 7
; VI-NEXT: v_readlane_b32 s38, v30, 6
; VI-NEXT: v_readlane_b32 s37, v30, 5
@@ -32805,7 +32996,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; VI-NEXT: ; implicit-def: $sgpr56
; VI-NEXT: ; implicit-def: $sgpr47
; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: s_branch .LBB45_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB45_2
+; VI-NEXT: s_branch .LBB45_3
;
; GFX9-LABEL: bitcast_v15i64_to_v60f16_scalar:
; GFX9: ; %bb.0:
@@ -32814,45 +33007,46 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v30, s30, 0
-; GFX9-NEXT: v_writelane_b32 v30, s31, 1
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_writelane_b32 v30, s34, 2
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s7, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v2
-; GFX9-NEXT: v_readfirstlane_b32 s9, v3
-; GFX9-NEXT: v_readfirstlane_b32 s10, v4
-; GFX9-NEXT: v_readfirstlane_b32 s11, v5
-; GFX9-NEXT: v_readfirstlane_b32 s12, v6
-; GFX9-NEXT: v_readfirstlane_b32 s13, v7
-; GFX9-NEXT: v_readfirstlane_b32 s14, v8
-; GFX9-NEXT: v_readfirstlane_b32 s15, v9
-; GFX9-NEXT: v_readfirstlane_b32 s40, v10
-; GFX9-NEXT: v_readfirstlane_b32 s41, v11
-; GFX9-NEXT: v_readfirstlane_b32 s42, v12
-; GFX9-NEXT: v_readfirstlane_b32 s43, v13
-; GFX9-NEXT: v_readfirstlane_b32 s44, v14
+; GFX9-NEXT: v_writelane_b32 v30, s31, 1
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX9-NEXT: v_readfirstlane_b32 s45, v15
+; GFX9-NEXT: v_writelane_b32 v30, s34, 2
+; GFX9-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s9, v2
+; GFX9-NEXT: v_readfirstlane_b32 s10, v3
+; GFX9-NEXT: v_readfirstlane_b32 s11, v4
+; GFX9-NEXT: v_readfirstlane_b32 s12, v5
+; GFX9-NEXT: v_readfirstlane_b32 s13, v6
+; GFX9-NEXT: v_readfirstlane_b32 s14, v7
+; GFX9-NEXT: v_readfirstlane_b32 s15, v8
+; GFX9-NEXT: v_readfirstlane_b32 s40, v9
+; GFX9-NEXT: v_readfirstlane_b32 s41, v10
+; GFX9-NEXT: v_readfirstlane_b32 s42, v11
+; GFX9-NEXT: v_readfirstlane_b32 s43, v12
+; GFX9-NEXT: v_readfirstlane_b32 s44, v13
+; GFX9-NEXT: v_readfirstlane_b32 s45, v14
+; GFX9-NEXT: v_readfirstlane_b32 s6, v15
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_writelane_b32 v30, s35, 3
; GFX9-NEXT: s_cbranch_scc0 .LBB45_4
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_lshr_b32 s46, s45, 16
-; GFX9-NEXT: s_lshr_b32 s47, s44, 16
-; GFX9-NEXT: s_lshr_b32 s56, s43, 16
-; GFX9-NEXT: s_lshr_b32 s57, s42, 16
-; GFX9-NEXT: s_lshr_b32 s58, s41, 16
-; GFX9-NEXT: s_lshr_b32 s59, s40, 16
-; GFX9-NEXT: s_lshr_b32 s60, s15, 16
-; GFX9-NEXT: s_lshr_b32 s61, s14, 16
-; GFX9-NEXT: s_lshr_b32 s62, s13, 16
-; GFX9-NEXT: s_lshr_b32 s63, s12, 16
-; GFX9-NEXT: s_lshr_b32 s72, s11, 16
-; GFX9-NEXT: s_lshr_b32 s73, s10, 16
-; GFX9-NEXT: s_lshr_b32 s74, s9, 16
-; GFX9-NEXT: s_lshr_b32 s75, s8, 16
-; GFX9-NEXT: s_lshr_b32 s76, s7, 16
-; GFX9-NEXT: s_lshr_b32 s77, s6, 16
+; GFX9-NEXT: s_lshr_b32 s46, s6, 16
+; GFX9-NEXT: s_lshr_b32 s47, s45, 16
+; GFX9-NEXT: s_lshr_b32 s56, s44, 16
+; GFX9-NEXT: s_lshr_b32 s57, s43, 16
+; GFX9-NEXT: s_lshr_b32 s58, s42, 16
+; GFX9-NEXT: s_lshr_b32 s59, s41, 16
+; GFX9-NEXT: s_lshr_b32 s60, s40, 16
+; GFX9-NEXT: s_lshr_b32 s61, s15, 16
+; GFX9-NEXT: s_lshr_b32 s62, s14, 16
+; GFX9-NEXT: s_lshr_b32 s63, s13, 16
+; GFX9-NEXT: s_lshr_b32 s72, s12, 16
+; GFX9-NEXT: s_lshr_b32 s73, s11, 16
+; GFX9-NEXT: s_lshr_b32 s74, s10, 16
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s8, 16
+; GFX9-NEXT: s_lshr_b32 s77, s7, 16
; GFX9-NEXT: s_lshr_b32 s78, s29, 16
; GFX9-NEXT: s_lshr_b32 s79, s28, 16
; GFX9-NEXT: s_lshr_b32 s88, s27, 16
@@ -32869,22 +33063,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s35, s16, 16
; GFX9-NEXT: s_cbranch_execnz .LBB45_3
; GFX9-NEXT: .LBB45_2: ; %cmp.true
-; GFX9-NEXT: s_add_u32 s44, s44, 3
-; GFX9-NEXT: s_addc_u32 s45, s45, 0
-; GFX9-NEXT: s_add_u32 s42, s42, 3
-; GFX9-NEXT: s_addc_u32 s43, s43, 0
-; GFX9-NEXT: s_add_u32 s40, s40, 3
-; GFX9-NEXT: s_addc_u32 s41, s41, 0
-; GFX9-NEXT: s_add_u32 s14, s14, 3
-; GFX9-NEXT: s_addc_u32 s15, s15, 0
-; GFX9-NEXT: s_add_u32 s12, s12, 3
-; GFX9-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NEXT: s_add_u32 s10, s10, 3
-; GFX9-NEXT: s_addc_u32 s11, s11, 0
-; GFX9-NEXT: s_add_u32 s8, s8, 3
-; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_add_u32 s6, s6, 3
-; GFX9-NEXT: s_addc_u32 s7, s7, 0
+; GFX9-NEXT: s_add_u32 s45, s45, 3
+; GFX9-NEXT: s_addc_u32 s6, s6, 0
+; GFX9-NEXT: s_add_u32 s43, s43, 3
+; GFX9-NEXT: s_addc_u32 s44, s44, 0
+; GFX9-NEXT: s_add_u32 s41, s41, 3
+; GFX9-NEXT: s_addc_u32 s42, s42, 0
+; GFX9-NEXT: s_add_u32 s15, s15, 3
+; GFX9-NEXT: s_addc_u32 s40, s40, 0
+; GFX9-NEXT: s_add_u32 s13, s13, 3
+; GFX9-NEXT: s_addc_u32 s14, s14, 0
+; GFX9-NEXT: s_add_u32 s11, s11, 3
+; GFX9-NEXT: s_addc_u32 s12, s12, 0
+; GFX9-NEXT: s_add_u32 s9, s9, 3
+; GFX9-NEXT: s_addc_u32 s10, s10, 0
+; GFX9-NEXT: s_add_u32 s7, s7, 3
+; GFX9-NEXT: s_addc_u32 s8, s8, 0
; GFX9-NEXT: s_add_u32 s28, s28, 3
; GFX9-NEXT: s_addc_u32 s29, s29, 0
; GFX9-NEXT: s_add_u32 s26, s26, 3
@@ -32899,22 +33093,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: s_addc_u32 s19, s19, 0
; GFX9-NEXT: s_add_u32 s16, s16, 3
; GFX9-NEXT: s_addc_u32 s17, s17, 0
-; GFX9-NEXT: s_lshr_b32 s46, s45, 16
-; GFX9-NEXT: s_lshr_b32 s47, s44, 16
-; GFX9-NEXT: s_lshr_b32 s56, s43, 16
-; GFX9-NEXT: s_lshr_b32 s57, s42, 16
-; GFX9-NEXT: s_lshr_b32 s58, s41, 16
-; GFX9-NEXT: s_lshr_b32 s59, s40, 16
-; GFX9-NEXT: s_lshr_b32 s60, s15, 16
-; GFX9-NEXT: s_lshr_b32 s61, s14, 16
-; GFX9-NEXT: s_lshr_b32 s62, s13, 16
-; GFX9-NEXT: s_lshr_b32 s63, s12, 16
-; GFX9-NEXT: s_lshr_b32 s72, s11, 16
-; GFX9-NEXT: s_lshr_b32 s73, s10, 16
-; GFX9-NEXT: s_lshr_b32 s74, s9, 16
-; GFX9-NEXT: s_lshr_b32 s75, s8, 16
-; GFX9-NEXT: s_lshr_b32 s76, s7, 16
-; GFX9-NEXT: s_lshr_b32 s77, s6, 16
+; GFX9-NEXT: s_lshr_b32 s46, s6, 16
+; GFX9-NEXT: s_lshr_b32 s47, s45, 16
+; GFX9-NEXT: s_lshr_b32 s56, s44, 16
+; GFX9-NEXT: s_lshr_b32 s57, s43, 16
+; GFX9-NEXT: s_lshr_b32 s58, s42, 16
+; GFX9-NEXT: s_lshr_b32 s59, s41, 16
+; GFX9-NEXT: s_lshr_b32 s60, s40, 16
+; GFX9-NEXT: s_lshr_b32 s61, s15, 16
+; GFX9-NEXT: s_lshr_b32 s62, s14, 16
+; GFX9-NEXT: s_lshr_b32 s63, s13, 16
+; GFX9-NEXT: s_lshr_b32 s72, s12, 16
+; GFX9-NEXT: s_lshr_b32 s73, s11, 16
+; GFX9-NEXT: s_lshr_b32 s74, s10, 16
+; GFX9-NEXT: s_lshr_b32 s75, s9, 16
+; GFX9-NEXT: s_lshr_b32 s76, s8, 16
+; GFX9-NEXT: s_lshr_b32 s77, s7, 16
; GFX9-NEXT: s_lshr_b32 s78, s29, 16
; GFX9-NEXT: s_lshr_b32 s79, s28, 16
; GFX9-NEXT: s_lshr_b32 s88, s27, 16
@@ -32944,22 +33138,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88
; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79
; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77
-; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76
-; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72
-; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63
-; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62
-; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61
-; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60
-; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59
-; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58
-; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57
-; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56
-; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47
-; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46
+; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s77
+; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s76
+; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s75
+; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s74
+; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s73
+; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s72
+; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s63
+; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s62
+; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s61
+; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s60
+; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s59
+; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s58
+; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s57
+; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s56
+; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s47
+; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s46
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s16
@@ -32974,22 +33168,22 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v11, s25
; GFX9-NEXT: v_mov_b32_e32 v12, s26
; GFX9-NEXT: v_mov_b32_e32 v13, s27
-; GFX9-NEXT: v_mov_b32_e32 v14, s6
-; GFX9-NEXT: v_mov_b32_e32 v15, s7
-; GFX9-NEXT: v_mov_b32_e32 v16, s8
-; GFX9-NEXT: v_mov_b32_e32 v17, s9
-; GFX9-NEXT: v_mov_b32_e32 v18, s10
-; GFX9-NEXT: v_mov_b32_e32 v19, s11
-; GFX9-NEXT: v_mov_b32_e32 v20, s12
-; GFX9-NEXT: v_mov_b32_e32 v21, s13
-; GFX9-NEXT: v_mov_b32_e32 v22, s14
-; GFX9-NEXT: v_mov_b32_e32 v23, s15
-; GFX9-NEXT: v_mov_b32_e32 v24, s28
-; GFX9-NEXT: v_mov_b32_e32 v25, s29
-; GFX9-NEXT: v_mov_b32_e32 v26, s40
-; GFX9-NEXT: v_mov_b32_e32 v27, s41
-; GFX9-NEXT: v_mov_b32_e32 v28, s42
-; GFX9-NEXT: v_mov_b32_e32 v29, s43
+; GFX9-NEXT: v_mov_b32_e32 v14, s7
+; GFX9-NEXT: v_mov_b32_e32 v15, s8
+; GFX9-NEXT: v_mov_b32_e32 v16, s9
+; GFX9-NEXT: v_mov_b32_e32 v17, s10
+; GFX9-NEXT: v_mov_b32_e32 v18, s11
+; GFX9-NEXT: v_mov_b32_e32 v19, s12
+; GFX9-NEXT: v_mov_b32_e32 v20, s13
+; GFX9-NEXT: v_mov_b32_e32 v21, s14
+; GFX9-NEXT: v_mov_b32_e32 v22, s15
+; GFX9-NEXT: v_mov_b32_e32 v23, s28
+; GFX9-NEXT: v_mov_b32_e32 v24, s29
+; GFX9-NEXT: v_mov_b32_e32 v25, s40
+; GFX9-NEXT: v_mov_b32_e32 v26, s41
+; GFX9-NEXT: v_mov_b32_e32 v27, s42
+; GFX9-NEXT: v_mov_b32_e32 v28, s43
+; GFX9-NEXT: v_mov_b32_e32 v29, s6
; GFX9-NEXT: v_readlane_b32 s35, v30, 3
; GFX9-NEXT: v_readlane_b32 s34, v30, 2
; GFX9-NEXT: v_readlane_b32 s31, v30, 1
@@ -33030,7 +33224,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX9-NEXT: ; implicit-def: $sgpr56
; GFX9-NEXT: ; implicit-def: $sgpr47
; GFX9-NEXT: ; implicit-def: $sgpr46
-; GFX9-NEXT: s_branch .LBB45_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB45_2
+; GFX9-NEXT: s_branch .LBB45_3
;
; GFX11-LABEL: bitcast_v15i64_to_v60f16_scalar:
; GFX11: ; %bb.0:
@@ -33045,16 +33241,16 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX11-NEXT: v_readfirstlane_b32 s10, v6
; GFX11-NEXT: v_readfirstlane_b32 s11, v7
; GFX11-NEXT: v_readfirstlane_b32 s12, v8
-; GFX11-NEXT: v_readfirstlane_b32 s13, v9
+; GFX11-NEXT: v_readfirstlane_b32 s14, v9
; GFX11-NEXT: v_readfirstlane_b32 s15, v10
-; GFX11-NEXT: v_readfirstlane_b32 s14, v11
-; GFX11-NEXT: s_mov_b32 s94, 0
+; GFX11-NEXT: v_readfirstlane_b32 s13, v11
+; GFX11-NEXT: s_mov_b32 s94, -1
; GFX11-NEXT: s_and_b32 s40, vcc_lo, exec_lo
; GFX11-NEXT: s_cbranch_scc0 .LBB45_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: s_lshr_b32 s40, s14, 16
+; GFX11-NEXT: s_lshr_b32 s40, s13, 16
; GFX11-NEXT: s_lshr_b32 s41, s15, 16
-; GFX11-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-NEXT: s_lshr_b32 s42, s14, 16
; GFX11-NEXT: s_lshr_b32 s43, s12, 16
; GFX11-NEXT: s_lshr_b32 s44, s11, 16
; GFX11-NEXT: s_lshr_b32 s45, s10, 16
@@ -33082,13 +33278,12 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX11-NEXT: s_lshr_b32 s91, s2, 16
; GFX11-NEXT: s_lshr_b32 s92, s1, 16
; GFX11-NEXT: s_lshr_b32 s93, s0, 16
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
-; GFX11-NEXT: s_cbranch_vccnz .LBB45_3
+; GFX11-NEXT: s_cbranch_execnz .LBB45_3
; GFX11-NEXT: .LBB45_2: ; %cmp.true
; GFX11-NEXT: s_add_u32 s15, s15, 3
-; GFX11-NEXT: s_addc_u32 s14, s14, 0
-; GFX11-NEXT: s_add_u32 s12, s12, 3
; GFX11-NEXT: s_addc_u32 s13, s13, 0
+; GFX11-NEXT: s_add_u32 s12, s12, 3
+; GFX11-NEXT: s_addc_u32 s14, s14, 0
; GFX11-NEXT: s_add_u32 s10, s10, 3
; GFX11-NEXT: s_addc_u32 s11, s11, 0
; GFX11-NEXT: s_add_u32 s8, s8, 3
@@ -33115,9 +33310,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX11-NEXT: s_addc_u32 s3, s3, 0
; GFX11-NEXT: s_add_u32 s0, s0, 3
; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_lshr_b32 s40, s14, 16
+; GFX11-NEXT: s_lshr_b32 s40, s13, 16
; GFX11-NEXT: s_lshr_b32 s41, s15, 16
-; GFX11-NEXT: s_lshr_b32 s42, s13, 16
+; GFX11-NEXT: s_lshr_b32 s42, s14, 16
; GFX11-NEXT: s_lshr_b32 s43, s12, 16
; GFX11-NEXT: s_lshr_b32 s44, s11, 16
; GFX11-NEXT: s_lshr_b32 s45, s10, 16
@@ -33174,9 +33369,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX11-NEXT: s_pack_ll_b32_b16 s10, s10, s45
; GFX11-NEXT: s_pack_ll_b32_b16 s11, s11, s44
; GFX11-NEXT: s_pack_ll_b32_b16 s12, s12, s43
-; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s42
+; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s42
; GFX11-NEXT: s_pack_ll_b32_b16 s15, s15, s41
-; GFX11-NEXT: s_pack_ll_b32_b16 s14, s14, s40
+; GFX11-NEXT: s_pack_ll_b32_b16 s13, s13, s40
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17
@@ -33190,8 +33385,8 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7
; GFX11-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9
; GFX11-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11
-; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13
-; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
+; GFX11-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s14
+; GFX11-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s13
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-NEXT: .LBB45_4:
; GFX11-NEXT: ; implicit-def: $sgpr93
@@ -33224,7 +33419,9 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr41
; GFX11-NEXT: ; implicit-def: $sgpr40
-; GFX11-NEXT: s_branch .LBB45_2
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94
+; GFX11-NEXT: s_cbranch_vccz .LBB45_2
+; GFX11-NEXT: s_branch .LBB45_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -34835,11 +35032,11 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28
@@ -34853,83 +35050,92 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v39, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v2
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v8
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v62, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v9
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v62, v12
; SI-NEXT: v_cvt_f16_f32_e32 v41, v15
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v55, v14
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
; SI-NEXT: v_cvt_f16_f32_e32 v15, v17
; SI-NEXT: v_cvt_f16_f32_e32 v61, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v19
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v18
; SI-NEXT: v_cvt_f16_f32_e32 v17, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v20
; SI-NEXT: v_cvt_f16_f32_e32 v18, v23
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v19, v25
; SI-NEXT: v_cvt_f16_f32_e32 v21, v24
; SI-NEXT: v_cvt_f16_f32_e32 v20, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v26
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v29
-; SI-NEXT: v_cvt_f16_f32_e32 v51, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v12, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v12, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s18
; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v9, s20
; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
-; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
+; SI-NEXT: v_cvt_f16_f32_e32 v8, s24
; SI-NEXT: v_cvt_f16_f32_e32 v5, s27
-; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
+; SI-NEXT: v_cvt_f16_f32_e32 v7, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
-; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v54
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v50
+; SI-NEXT: v_cvt_f16_f32_e32 v50, s28
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v40
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v42
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v45
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -34938,260 +35144,240 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
-; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v59
; SI-NEXT: v_cvt_f16_f32_e32 v59, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB47_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v21, v19
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v22, v18
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
-; SI-NEXT: v_mov_b32_e32 v33, v32
-; SI-NEXT: v_or_b32_e32 v10, v32, v10
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_or_b32_e32 v13, v43, v13
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v8, v5
-; SI-NEXT: v_mov_b32_e32 v57, v39
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49
-; SI-NEXT: v_or_b32_e32 v7, v37, v7
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v6, v50, v6
+; SI-NEXT: v_mov_b32_e32 v30, v50
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
-; SI-NEXT: v_or_b32_e32 v1, v12, v1
-; SI-NEXT: v_or_b32_e32 v2, v11, v2
-; SI-NEXT: v_or_b32_e32 v4, v9, v4
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_mov_b32_e32 v52, v12
+; SI-NEXT: v_or_b32_e32 v0, v12, v0
+; SI-NEXT: v_or_b32_e32 v1, v11, v1
+; SI-NEXT: v_or_b32_e32 v2, v9, v2
+; SI-NEXT: v_or_b32_e32 v3, v10, v3
+; SI-NEXT: v_or_b32_e32 v4, v8, v4
+; SI-NEXT: v_or_b32_e32 v5, v7, v5
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_or_b32_e32 v18, v22, v18
-; SI-NEXT: v_or_b32_e32 v19, v21, v19
-; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48
-; SI-NEXT: v_or_b32_e32 v0, v58, v0
-; SI-NEXT: v_mov_b32_e32 v56, v34
-; SI-NEXT: v_mov_b32_e32 v47, v36
-; SI-NEXT: v_mov_b32_e32 v46, v35
+; SI-NEXT: v_mov_b32_e32 v58, v49
+; SI-NEXT: v_or_b32_e32 v8, v49, v8
+; SI-NEXT: v_mov_b32_e32 v57, v48
+; SI-NEXT: v_mov_b32_e32 v56, v36
+; SI-NEXT: v_or_b32_e32 v9, v36, v9
+; SI-NEXT: v_mov_b32_e32 v47, v38
+; SI-NEXT: v_mov_b32_e32 v46, v37
+; SI-NEXT: v_or_b32_e32 v10, v37, v10
+; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v45, v35
; SI-NEXT: v_or_b32_e32 v11, v35, v11
+; SI-NEXT: v_mov_b32_e32 v44, v34
; SI-NEXT: v_mov_b32_e32 v60, v63
-; SI-NEXT: v_mov_b32_e32 v45, v62
-; SI-NEXT: v_or_b32_e32 v12, v62, v12
+; SI-NEXT: v_or_b32_e32 v12, v63, v12
+; SI-NEXT: v_mov_b32_e32 v43, v62
+; SI-NEXT: v_or_b32_e32 v13, v62, v13
; SI-NEXT: v_mov_b32_e32 v42, v41
; SI-NEXT: v_mov_b32_e32 v40, v55
; SI-NEXT: v_or_b32_e32 v14, v55, v14
; SI-NEXT: v_or_b32_e32 v15, v61, v15
-; SI-NEXT: v_or_b32_e32 v20, v53, v20
-; SI-NEXT: v_or_b32_e32 v21, v51, v21
-; SI-NEXT: v_or_b32_e32 v22, v30, v22
-; SI-NEXT: v_or_b32_e32 v23, v31, v23
+; SI-NEXT: v_or_b32_e32 v16, v53, v16
+; SI-NEXT: v_or_b32_e32 v17, v51, v17
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_or_b32_e32 v20, v21, v20
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v22, v21
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v23, v22
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v24, v23
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_or_b32_e32 v24, v25, v24
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_or_b32_e32 v17, v32, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; SI-NEXT: v_or_b32_e32 v25, v26, v25
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v16, v43, v16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; SI-NEXT: v_or_b32_e32 v26, v27, v26
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v35, v39
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; SI-NEXT: v_or_b32_e32 v27, v28, v27
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37
-; SI-NEXT: v_or_b32_e32 v9, v39, v9
-; SI-NEXT: v_mov_b32_e32 v36, v37
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; SI-NEXT: v_or_b32_e32 v28, v29, v28
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v8, v38, v8
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50
+; SI-NEXT: v_or_b32_e32 v7, v31, v7
+; SI-NEXT: v_mov_b32_e32 v35, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; SI-NEXT: v_or_b32_e32 v29, v54, v29
-; SI-NEXT: v_mov_b32_e32 v54, v32
; SI-NEXT: s_branch .LBB47_3
; SI-NEXT: .LBB47_2:
-; SI-NEXT: v_mov_b32_e32 v54, v53
-; SI-NEXT: v_mov_b32_e32 v53, v52
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v30
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_mov_b32_e32 v48, v31
+; SI-NEXT: v_mov_b32_e32 v52, v12
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT: v_mov_b32_e32 v47, v36
-; SI-NEXT: v_mov_b32_e32 v46, v35
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_mov_b32_e32 v30, v50
-; SI-NEXT: v_mov_b32_e32 v50, v51
-; SI-NEXT: v_mov_b32_e32 v51, v52
-; SI-NEXT: v_mov_b32_e32 v52, v53
-; SI-NEXT: v_mov_b32_e32 v53, v54
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v57, v39
-; SI-NEXT: v_mov_b32_e32 v56, v34
+; SI-NEXT: v_mov_b32_e32 v45, v35
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v58, v49
+; SI-NEXT: v_mov_b32_e32 v57, v48
+; SI-NEXT: v_mov_b32_e32 v56, v36
+; SI-NEXT: v_mov_b32_e32 v47, v38
+; SI-NEXT: v_mov_b32_e32 v46, v37
; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v44, v34
; SI-NEXT: v_mov_b32_e32 v60, v63
-; SI-NEXT: v_mov_b32_e32 v45, v62
+; SI-NEXT: v_mov_b32_e32 v43, v62
; SI-NEXT: v_mov_b32_e32 v42, v41
; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v31, v48
-; SI-NEXT: v_mov_b32_e32 v48, v49
; SI-NEXT: .LBB47_3: ; %Flow
; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: v_mov_b32_e32 v61, v40
-; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB47_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v50
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v31
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v58
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v45
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v46
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v45
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v43
; SI-NEXT: v_mov_b32_e32 v55, v42
; SI-NEXT: v_cvt_f32_f16_e32 v15, v61
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
+; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v51
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v48
-; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v25, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v51
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -35199,42 +35385,48 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
+; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -35247,65 +35439,65 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v35
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v57
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: v_or_b32_e32 v9, v10, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v47
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v47
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v32
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v44
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v13, v12
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
@@ -35317,14 +35509,14 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v14, v15, v14
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_or_b32_e32 v15, v16, v15
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
@@ -35332,9 +35524,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: v_or_b32_e32 v16, v17, v16
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v17, v19, v17
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
@@ -35344,7 +35536,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_or_b32_e32 v18, v20, v18
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
@@ -35356,32 +35548,39 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_or_b32_e32 v19, v20, v19
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v52
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_or_b32_e32 v20, v22, v20
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v50
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_or_b32_e32 v21, v23, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v30
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_or_b32_e32 v22, v23, v22
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_or_b32_e32 v23, v25, v23
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
@@ -35466,6 +35665,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; VI-NEXT: s_lshr_b32 s42, s17, 16
; VI-NEXT: s_lshr_b32 s43, s16, 16
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v32, v15
; VI-NEXT: v_mov_b32_e32 v33, v14
; VI-NEXT: v_mov_b32_e32 v34, v13
@@ -35482,7 +35682,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v53, v2
; VI-NEXT: v_mov_b32_e32 v54, v1
; VI-NEXT: v_mov_b32_e32 v55, v0
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: s_cbranch_scc0 .LBB47_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_mov_b32_e32 v0, 16
@@ -35686,11 +35886,28 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB47_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; VI-NEXT: s_branch .LBB47_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB47_2
+; VI-NEXT: s_branch .LBB47_3
;
; GFX9-LABEL: bitcast_v60f16_to_v15i64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_lshr_b32 s40, s29, 16
+; GFX9-NEXT: s_lshr_b32 s41, s28, 16
+; GFX9-NEXT: s_lshr_b32 s42, s27, 16
+; GFX9-NEXT: s_lshr_b32 s43, s26, 16
+; GFX9-NEXT: s_lshr_b32 s15, s25, 16
+; GFX9-NEXT: s_lshr_b32 s14, s24, 16
+; GFX9-NEXT: s_lshr_b32 s13, s23, 16
+; GFX9-NEXT: s_lshr_b32 s12, s22, 16
+; GFX9-NEXT: s_lshr_b32 s11, s21, 16
+; GFX9-NEXT: s_lshr_b32 s10, s20, 16
+; GFX9-NEXT: s_lshr_b32 s9, s19, 16
+; GFX9-NEXT: s_lshr_b32 s8, s18, 16
+; GFX9-NEXT: s_lshr_b32 s7, s17, 16
+; GFX9-NEXT: s_lshr_b32 s6, s16, 16
; GFX9-NEXT: v_mov_b32_e32 v32, v15
; GFX9-NEXT: v_mov_b32_e32 v33, v14
; GFX9-NEXT: v_mov_b32_e32 v34, v13
@@ -35707,21 +35924,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: v_mov_b32_e32 v53, v2
; GFX9-NEXT: v_mov_b32_e32 v54, v1
; GFX9-NEXT: v_mov_b32_e32 v55, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_lshr_b32 s40, s29, 16
-; GFX9-NEXT: s_lshr_b32 s41, s28, 16
-; GFX9-NEXT: s_lshr_b32 s42, s27, 16
-; GFX9-NEXT: s_lshr_b32 s43, s26, 16
-; GFX9-NEXT: s_lshr_b32 s15, s25, 16
-; GFX9-NEXT: s_lshr_b32 s14, s24, 16
-; GFX9-NEXT: s_lshr_b32 s13, s23, 16
-; GFX9-NEXT: s_lshr_b32 s12, s22, 16
-; GFX9-NEXT: s_lshr_b32 s11, s21, 16
-; GFX9-NEXT: s_lshr_b32 s10, s20, 16
-; GFX9-NEXT: s_lshr_b32 s9, s19, 16
-; GFX9-NEXT: s_lshr_b32 s8, s18, 16
-; GFX9-NEXT: s_lshr_b32 s7, s17, 16
-; GFX9-NEXT: s_lshr_b32 s6, s16, 16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -35742,7 +35945,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33
; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7
; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8
@@ -35757,6 +35959,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42
; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37
; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38
@@ -35904,7 +36107,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: .LBB47_4:
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX9-NEXT: s_branch .LBB47_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB47_2
+; GFX9-NEXT: s_branch .LBB47_3
;
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -35949,41 +36154,41 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1
+; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -35998,17 +36203,16 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB47_3
; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -36022,24 +36226,24 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -36056,7 +36260,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB47_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB47_2
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB47_2
+; GFX11-TRUE16-NEXT: s_branch .LBB47_3
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64_scalar:
; GFX11-FAKE16: ; %bb.0:
@@ -36089,41 +36295,41 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -36138,17 +36344,16 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB47_3
; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -36162,24 +36367,24 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -36196,7 +36401,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB47_4:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB47_2
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB47_2
+; GFX11-FAKE16-NEXT: s_branch .LBB47_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -37142,6 +37349,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v27, s16
; SI-NEXT: v_mov_b32_e32 v28, s17
; SI-NEXT: v_mov_b32_e32 v29, s18
@@ -37154,9 +37362,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; SI-NEXT: v_mov_b32_e32 v22, s25
; SI-NEXT: v_mov_b32_e32 v19, s26
; SI-NEXT: v_mov_b32_e32 v20, s27
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
; SI-NEXT: v_mov_b32_e32 v17, s28
; SI-NEXT: v_mov_b32_e32 v18, s29
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
@@ -37487,12 +37695,15 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; SI-NEXT: ; implicit-def: $vgpr48
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; implicit-def: $vgpr38
-; SI-NEXT: s_branch .LBB49_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz .LBB49_2
+; SI-NEXT: s_branch .LBB49_3
;
; VI-LABEL: bitcast_v15f64_to_v60i16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v17, s16
; VI-NEXT: v_mov_b32_e32 v18, s17
; VI-NEXT: v_mov_b32_e32 v29, s18
@@ -37505,9 +37716,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; VI-NEXT: v_mov_b32_e32 v24, s25
; VI-NEXT: v_mov_b32_e32 v21, s26
; VI-NEXT: v_mov_b32_e32 v22, s27
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v19, s28
; VI-NEXT: v_mov_b32_e32 v20, s29
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -37721,12 +37932,15 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; VI-NEXT: ; implicit-def: $vgpr40
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: s_branch .LBB49_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB49_2
+; VI-NEXT: s_branch .LBB49_3
;
; GFX9-LABEL: bitcast_v15f64_to_v60i16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v17, s16
; GFX9-NEXT: v_mov_b32_e32 v18, s17
; GFX9-NEXT: v_mov_b32_e32 v29, s18
@@ -37739,9 +37953,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v24, s25
; GFX9-NEXT: v_mov_b32_e32 v21, s26
; GFX9-NEXT: v_mov_b32_e32 v22, s27
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v19, s28
; GFX9-NEXT: v_mov_b32_e32 v20, s29
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -37955,7 +38169,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr54
-; GFX9-NEXT: s_branch .LBB49_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB49_2
+; GFX9-NEXT: s_branch .LBB49_3
;
; GFX11-LABEL: bitcast_v15f64_to_v60i16_scalar:
; GFX11: ; %bb.0:
@@ -37970,8 +38186,8 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25
; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
; GFX11-NEXT: s_cbranch_scc0 .LBB49_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
@@ -38004,8 +38220,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31
; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB49_3
+; GFX11-NEXT: s_cbranch_execnz .LBB49_3
; GFX11-NEXT: .LBB49_2: ; %cmp.true
; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
@@ -38154,7 +38369,9 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
; GFX11-NEXT: ; implicit-def: $vgpr52
; GFX11-NEXT: ; implicit-def: $vgpr51
; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB49_2
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_vccz .LBB49_2
+; GFX11-NEXT: s_branch .LBB49_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -39572,6 +39789,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; SI-NEXT: v_mov_b32_e32 v35, v22
; SI-NEXT: v_mov_b32_e32 v36, v20
; SI-NEXT: v_mov_b32_e32 v37, v18
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5
@@ -39603,7 +39821,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4
-; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_and_b64 s[6:7], vcc, exec
; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10
@@ -39906,7 +40124,9 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; SI-NEXT: v_mov_b32_e32 v30, v32
-; SI-NEXT: s_branch .LBB51_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz .LBB51_2
+; SI-NEXT: s_branch .LBB51_3
;
; VI-LABEL: bitcast_v60i16_to_v15f64_scalar:
; VI: ; %bb.0:
@@ -39926,6 +40146,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; VI-NEXT: s_lshr_b32 s42, s17, 16
; VI-NEXT: s_lshr_b32 s43, s16, 16
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v32, v15
; VI-NEXT: v_mov_b32_e32 v33, v14
; VI-NEXT: v_mov_b32_e32 v34, v13
@@ -39942,7 +40163,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; VI-NEXT: v_mov_b32_e32 v53, v2
; VI-NEXT: v_mov_b32_e32 v54, v1
; VI-NEXT: v_mov_b32_e32 v55, v0
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: s_cbranch_scc0 .LBB51_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_mov_b32_e32 v0, 16
@@ -40189,11 +40410,28 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB51_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; VI-NEXT: s_branch .LBB51_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB51_2
+; VI-NEXT: s_branch .LBB51_3
;
; GFX9-LABEL: bitcast_v60i16_to_v15f64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_lshr_b32 s40, s29, 16
+; GFX9-NEXT: s_lshr_b32 s41, s28, 16
+; GFX9-NEXT: s_lshr_b32 s42, s27, 16
+; GFX9-NEXT: s_lshr_b32 s43, s26, 16
+; GFX9-NEXT: s_lshr_b32 s15, s25, 16
+; GFX9-NEXT: s_lshr_b32 s14, s24, 16
+; GFX9-NEXT: s_lshr_b32 s13, s23, 16
+; GFX9-NEXT: s_lshr_b32 s12, s22, 16
+; GFX9-NEXT: s_lshr_b32 s11, s21, 16
+; GFX9-NEXT: s_lshr_b32 s10, s20, 16
+; GFX9-NEXT: s_lshr_b32 s9, s19, 16
+; GFX9-NEXT: s_lshr_b32 s8, s18, 16
+; GFX9-NEXT: s_lshr_b32 s7, s17, 16
+; GFX9-NEXT: s_lshr_b32 s6, s16, 16
; GFX9-NEXT: v_mov_b32_e32 v32, v15
; GFX9-NEXT: v_mov_b32_e32 v33, v14
; GFX9-NEXT: v_mov_b32_e32 v34, v13
@@ -40210,21 +40448,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: v_mov_b32_e32 v53, v2
; GFX9-NEXT: v_mov_b32_e32 v54, v1
; GFX9-NEXT: v_mov_b32_e32 v55, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_lshr_b32 s40, s29, 16
-; GFX9-NEXT: s_lshr_b32 s41, s28, 16
-; GFX9-NEXT: s_lshr_b32 s42, s27, 16
-; GFX9-NEXT: s_lshr_b32 s43, s26, 16
-; GFX9-NEXT: s_lshr_b32 s15, s25, 16
-; GFX9-NEXT: s_lshr_b32 s14, s24, 16
-; GFX9-NEXT: s_lshr_b32 s13, s23, 16
-; GFX9-NEXT: s_lshr_b32 s12, s22, 16
-; GFX9-NEXT: s_lshr_b32 s11, s21, 16
-; GFX9-NEXT: s_lshr_b32 s10, s20, 16
-; GFX9-NEXT: s_lshr_b32 s9, s19, 16
-; GFX9-NEXT: s_lshr_b32 s8, s18, 16
-; GFX9-NEXT: s_lshr_b32 s7, s17, 16
-; GFX9-NEXT: s_lshr_b32 s6, s16, 16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -40245,7 +40469,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33
; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7
; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8
@@ -40260,6 +40483,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42
; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37
; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38
@@ -40405,7 +40629,9 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: .LBB51_4:
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX9-NEXT: s_branch .LBB51_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB51_2
+; GFX9-NEXT: s_branch .LBB51_3
;
; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -40450,41 +40676,41 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1
+; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -40499,17 +40725,16 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_3
; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -40523,24 +40748,24 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -40557,7 +40782,9 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB51_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB51_2
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB51_2
+; GFX11-TRUE16-NEXT: s_branch .LBB51_3
;
; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64_scalar:
; GFX11-FAKE16: ; %bb.0:
@@ -40590,41 +40817,41 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -40639,17 +40866,16 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_3
; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -40663,24 +40889,24 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -40697,7 +40923,9 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a,
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB51_4:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB51_2
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB51_2
+; GFX11-FAKE16-NEXT: s_branch .LBB51_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -41991,6 +42219,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; SI-NEXT: s_and_b64 s[44:45], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s42, v1
; SI-NEXT: v_readfirstlane_b32 s43, v2
; SI-NEXT: v_readfirstlane_b32 s40, v3
@@ -42006,8 +42235,8 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: v_readfirstlane_b32 s6, v13
; SI-NEXT: v_readfirstlane_b32 s7, v14
; SI-NEXT: v_readfirstlane_b32 s4, v15
-; SI-NEXT: s_and_b64 s[44:45], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v16
+; SI-NEXT: s_mov_b64 s[44:45], -1
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -42542,12 +42771,15 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; SI-NEXT: ; implicit-def: $vgpr36
; SI-NEXT: ; implicit-def: $vgpr14
; SI-NEXT: ; implicit-def: $vgpr59
-; SI-NEXT: s_branch .LBB53_2
+; SI-NEXT: s_andn2_b64 vcc, exec, s[44:45]
+; SI-NEXT: s_cbranch_vccz .LBB53_2
+; SI-NEXT: s_branch .LBB53_3
;
; VI-LABEL: bitcast_v15f64_to_v60f16_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v17, s16
; VI-NEXT: v_mov_b32_e32 v18, s17
; VI-NEXT: v_mov_b32_e32 v29, s18
@@ -42560,9 +42792,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; VI-NEXT: v_mov_b32_e32 v24, s25
; VI-NEXT: v_mov_b32_e32 v21, s26
; VI-NEXT: v_mov_b32_e32 v22, s27
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v19, s28
; VI-NEXT: v_mov_b32_e32 v20, s29
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -42776,12 +43008,15 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; VI-NEXT: ; implicit-def: $vgpr40
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: s_branch .LBB53_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB53_2
+; VI-NEXT: s_branch .LBB53_3
;
; GFX9-LABEL: bitcast_v15f64_to_v60f16_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v17, s16
; GFX9-NEXT: v_mov_b32_e32 v18, s17
; GFX9-NEXT: v_mov_b32_e32 v29, s18
@@ -42794,9 +43029,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; GFX9-NEXT: v_mov_b32_e32 v24, s25
; GFX9-NEXT: v_mov_b32_e32 v21, s26
; GFX9-NEXT: v_mov_b32_e32 v22, s27
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_mov_b32_e32 v19, s28
; GFX9-NEXT: v_mov_b32_e32 v20, s29
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -43010,7 +43245,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr55
; GFX9-NEXT: ; implicit-def: $vgpr54
-; GFX9-NEXT: s_branch .LBB53_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB53_2
+; GFX9-NEXT: s_branch .LBB53_3
;
; GFX11-LABEL: bitcast_v15f64_to_v60f16_scalar:
; GFX11: ; %bb.0:
@@ -43025,8 +43262,8 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; GFX11-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25
; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; GFX11-NEXT: s_mov_b32 s0, -1
; GFX11-NEXT: s_cbranch_scc0 .LBB53_4
; GFX11-NEXT: ; %bb.1: ; %cmp.false
; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
@@ -43059,8 +43296,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v28
; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v31
; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v30
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_vccnz .LBB53_3
+; GFX11-NEXT: s_cbranch_execnz .LBB53_3
; GFX11-NEXT: .LBB53_2: ; %cmp.true
; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
@@ -43209,7 +43445,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a
; GFX11-NEXT: ; implicit-def: $vgpr52
; GFX11-NEXT: ; implicit-def: $vgpr51
; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: s_branch .LBB53_2
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_vccz .LBB53_2
+; GFX11-NEXT: s_branch .LBB53_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -44820,11 +45058,11 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28
@@ -44838,83 +45076,92 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56
; SI-NEXT: s_waitcnt expcnt(3)
; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52
-; SI-NEXT: v_cvt_f16_f32_e32 v37, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v39, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v34, v7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v49, v2
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
-; SI-NEXT: v_cvt_f16_f32_e32 v32, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v36, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v35, v8
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v26
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v48, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v36, v4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v63, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v62, v10
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v38, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v37, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v9
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v35, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v34, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v63, v10
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v18
-; SI-NEXT: v_cvt_f16_f32_e32 v43, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v62, v12
; SI-NEXT: v_cvt_f16_f32_e32 v41, v15
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f16_f32_e32 v55, v14
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v20
; SI-NEXT: v_cvt_f16_f32_e32 v15, v17
; SI-NEXT: v_cvt_f16_f32_e32 v61, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v19
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: v_cvt_f16_f32_e32 v53, v18
; SI-NEXT: v_cvt_f16_f32_e32 v17, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v51, v20
; SI-NEXT: v_cvt_f16_f32_e32 v18, v23
; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_cvt_f16_f32_e32 v19, v25
; SI-NEXT: v_cvt_f16_f32_e32 v21, v24
; SI-NEXT: v_cvt_f16_f32_e32 v20, v27
-; SI-NEXT: v_cvt_f16_f32_e32 v53, v26
-; SI-NEXT: v_cvt_f16_f32_e32 v52, v29
-; SI-NEXT: v_cvt_f16_f32_e32 v51, v28
-; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v12, s16
; SI-NEXT: v_cvt_f16_f32_e32 v1, s19
-; SI-NEXT: v_cvt_f16_f32_e32 v12, s18
+; SI-NEXT: v_cvt_f16_f32_e32 v11, s18
; SI-NEXT: v_cvt_f16_f32_e32 v2, s21
-; SI-NEXT: v_cvt_f16_f32_e32 v11, s20
+; SI-NEXT: v_cvt_f16_f32_e32 v9, s20
; SI-NEXT: v_cvt_f16_f32_e32 v3, s23
; SI-NEXT: v_cvt_f16_f32_e32 v10, s22
; SI-NEXT: v_cvt_f16_f32_e32 v4, s25
-; SI-NEXT: v_cvt_f16_f32_e32 v9, s24
+; SI-NEXT: v_cvt_f16_f32_e32 v8, s24
; SI-NEXT: v_cvt_f16_f32_e32 v5, s27
-; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
+; SI-NEXT: v_cvt_f16_f32_e32 v7, s26
; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
-; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: v_cvt_f16_f32_e32 v50, v54
-; SI-NEXT: v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT: v_cvt_f16_f32_e32 v31, v40
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v54
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v50
+; SI-NEXT: v_cvt_f16_f32_e32 v50, s28
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v40
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v33
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v42
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v44
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v45
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v46
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v47
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -44923,260 +45170,240 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v57
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v58
-; SI-NEXT: v_cvt_f16_f32_e32 v58, s16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v59
; SI-NEXT: v_cvt_f16_f32_e32 v59, s17
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v60
; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; SI-NEXT: s_cbranch_scc0 .LBB55_2
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v19, v21, v19
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT: v_or_b32_e32 v18, v22, v18
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v10, v3
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34
-; SI-NEXT: v_mov_b32_e32 v33, v32
-; SI-NEXT: v_or_b32_e32 v10, v32, v10
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_or_b32_e32 v13, v43, v13
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v8, v5
-; SI-NEXT: v_mov_b32_e32 v57, v39
-; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49
-; SI-NEXT: v_or_b32_e32 v7, v37, v7
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: v_or_b32_e32 v6, v50, v6
+; SI-NEXT: v_mov_b32_e32 v30, v50
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59
-; SI-NEXT: v_or_b32_e32 v1, v12, v1
-; SI-NEXT: v_or_b32_e32 v2, v11, v2
-; SI-NEXT: v_or_b32_e32 v4, v9, v4
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36
-; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_mov_b32_e32 v52, v12
+; SI-NEXT: v_or_b32_e32 v0, v12, v0
+; SI-NEXT: v_or_b32_e32 v1, v11, v1
+; SI-NEXT: v_or_b32_e32 v2, v9, v2
+; SI-NEXT: v_or_b32_e32 v3, v10, v3
+; SI-NEXT: v_or_b32_e32 v4, v8, v4
+; SI-NEXT: v_or_b32_e32 v5, v7, v5
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v48
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT: v_or_b32_e32 v18, v22, v18
-; SI-NEXT: v_or_b32_e32 v19, v21, v19
-; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52
-; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50
-; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48
-; SI-NEXT: v_or_b32_e32 v0, v58, v0
-; SI-NEXT: v_mov_b32_e32 v56, v34
-; SI-NEXT: v_mov_b32_e32 v47, v36
-; SI-NEXT: v_mov_b32_e32 v46, v35
+; SI-NEXT: v_mov_b32_e32 v58, v49
+; SI-NEXT: v_or_b32_e32 v8, v49, v8
+; SI-NEXT: v_mov_b32_e32 v57, v48
+; SI-NEXT: v_mov_b32_e32 v56, v36
+; SI-NEXT: v_or_b32_e32 v9, v36, v9
+; SI-NEXT: v_mov_b32_e32 v47, v38
+; SI-NEXT: v_mov_b32_e32 v46, v37
+; SI-NEXT: v_or_b32_e32 v10, v37, v10
+; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v45, v35
; SI-NEXT: v_or_b32_e32 v11, v35, v11
+; SI-NEXT: v_mov_b32_e32 v44, v34
; SI-NEXT: v_mov_b32_e32 v60, v63
-; SI-NEXT: v_mov_b32_e32 v45, v62
-; SI-NEXT: v_or_b32_e32 v12, v62, v12
+; SI-NEXT: v_or_b32_e32 v12, v63, v12
+; SI-NEXT: v_mov_b32_e32 v43, v62
+; SI-NEXT: v_or_b32_e32 v13, v62, v13
; SI-NEXT: v_mov_b32_e32 v42, v41
; SI-NEXT: v_mov_b32_e32 v40, v55
; SI-NEXT: v_or_b32_e32 v14, v55, v14
; SI-NEXT: v_or_b32_e32 v15, v61, v15
-; SI-NEXT: v_or_b32_e32 v20, v53, v20
-; SI-NEXT: v_or_b32_e32 v21, v51, v21
-; SI-NEXT: v_or_b32_e32 v22, v30, v22
-; SI-NEXT: v_or_b32_e32 v23, v31, v23
+; SI-NEXT: v_or_b32_e32 v16, v53, v16
+; SI-NEXT: v_or_b32_e32 v17, v51, v17
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_or_b32_e32 v20, v21, v20
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v22, v21
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT: v_or_b32_e32 v22, v23, v22
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v24, v23
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT: s_waitcnt vmcnt(10)
; SI-NEXT: v_or_b32_e32 v24, v25, v24
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_or_b32_e32 v17, v32, v17
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
; SI-NEXT: v_or_b32_e32 v25, v26, v25
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v16, v43, v16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; SI-NEXT: v_or_b32_e32 v26, v27, v26
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v35, v39
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; SI-NEXT: v_or_b32_e32 v27, v28, v27
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37
-; SI-NEXT: v_or_b32_e32 v9, v39, v9
-; SI-NEXT: v_mov_b32_e32 v36, v37
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
; SI-NEXT: v_or_b32_e32 v28, v29, v28
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: v_or_b32_e32 v8, v38, v8
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50
+; SI-NEXT: v_or_b32_e32 v7, v31, v7
+; SI-NEXT: v_mov_b32_e32 v35, v50
+; SI-NEXT: v_mov_b32_e32 v50, v30
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; SI-NEXT: v_or_b32_e32 v29, v54, v29
-; SI-NEXT: v_mov_b32_e32 v54, v32
; SI-NEXT: s_branch .LBB55_3
; SI-NEXT: .LBB55_2:
-; SI-NEXT: v_mov_b32_e32 v54, v53
-; SI-NEXT: v_mov_b32_e32 v53, v52
-; SI-NEXT: v_mov_b32_e32 v52, v51
-; SI-NEXT: v_mov_b32_e32 v51, v50
-; SI-NEXT: v_mov_b32_e32 v50, v30
-; SI-NEXT: v_mov_b32_e32 v49, v48
-; SI-NEXT: v_mov_b32_e32 v48, v31
+; SI-NEXT: v_mov_b32_e32 v52, v12
; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; SI-NEXT: v_mov_b32_e32 v47, v36
-; SI-NEXT: v_mov_b32_e32 v46, v35
-; SI-NEXT: v_mov_b32_e32 v44, v43
-; SI-NEXT: v_mov_b32_e32 v30, v50
-; SI-NEXT: v_mov_b32_e32 v50, v51
-; SI-NEXT: v_mov_b32_e32 v51, v52
-; SI-NEXT: v_mov_b32_e32 v52, v53
-; SI-NEXT: v_mov_b32_e32 v53, v54
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: v_mov_b32_e32 v57, v39
-; SI-NEXT: v_mov_b32_e32 v56, v34
+; SI-NEXT: v_mov_b32_e32 v45, v35
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v58, v49
+; SI-NEXT: v_mov_b32_e32 v57, v48
+; SI-NEXT: v_mov_b32_e32 v56, v36
+; SI-NEXT: v_mov_b32_e32 v47, v38
+; SI-NEXT: v_mov_b32_e32 v46, v37
; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v44, v34
; SI-NEXT: v_mov_b32_e32 v60, v63
-; SI-NEXT: v_mov_b32_e32 v45, v62
+; SI-NEXT: v_mov_b32_e32 v43, v62
; SI-NEXT: v_mov_b32_e32 v42, v41
; SI-NEXT: v_mov_b32_e32 v40, v55
-; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: v_mov_b32_e32 v31, v48
-; SI-NEXT: v_mov_b32_e32 v48, v49
; SI-NEXT: .LBB55_3: ; %Flow
; SI-NEXT: v_mov_b32_e32 v32, v33
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: v_mov_b32_e32 v61, v40
-; SI-NEXT: v_mov_b32_e32 v40, v44
; SI-NEXT: s_cbranch_vccnz .LBB55_5
; SI-NEXT: ; %bb.4: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(5)
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(4)
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v0, v59
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v58
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v52
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v50
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v31
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v58
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v46
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v45
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v32
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v46
-; SI-NEXT: v_cvt_f32_f16_e32 v13, v45
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT: v_cvt_f32_f16_e32 v14, v40
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v43
; SI-NEXT: v_mov_b32_e32 v55, v42
; SI-NEXT: v_cvt_f32_f16_e32 v15, v61
-; SI-NEXT: v_cvt_f32_f16_e32 v17, v43
+; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT: v_cvt_f32_f16_e32 v19, v54
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v53
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v51
-; SI-NEXT: v_cvt_f32_f16_e32 v24, v48
-; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT: v_cvt_f32_f16_e32 v25, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v53
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v51
; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: s_waitcnt vmcnt(14)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -45184,42 +45411,48 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v2
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
+; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT: s_waitcnt vmcnt(13)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(12)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: s_waitcnt vmcnt(11)
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31
; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -45232,65 +45465,65 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v5, v6, v5
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v6, v7, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v37
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v35
; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_or_b32_e32 v7, v8, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v57
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v33
; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_or_b32_e32 v8, v9, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v57
; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: v_or_b32_e32 v9, v10, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v56
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v47
; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_or_b32_e32 v10, v11, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v47
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v32
; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_or_b32_e32 v11, v12, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v12, v60
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v44
; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_or_b32_e32 v12, v13, v12
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13
@@ -45302,14 +45535,14 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v14, v15, v14
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_or_b32_e32 v15, v16, v15
-; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16
@@ -45317,9 +45550,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; SI-NEXT: v_or_b32_e32 v16, v17, v16
; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18
-; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; SI-NEXT: v_or_b32_e32 v17, v19, v17
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
@@ -45329,7 +45562,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; SI-NEXT: v_or_b32_e32 v18, v20, v18
-; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19
@@ -45341,32 +45574,39 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
; SI-NEXT: v_or_b32_e32 v19, v20, v19
; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v21, v52
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_or_b32_e32 v20, v22, v20
-; SI-NEXT: v_cvt_f32_f16_e32 v22, v50
+; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21
; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; SI-NEXT: v_or_b32_e32 v21, v23, v21
-; SI-NEXT: v_cvt_f32_f16_e32 v23, v30
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23
; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
; SI-NEXT: v_or_b32_e32 v22, v23, v22
; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24
; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
; SI-NEXT: v_or_b32_e32 v23, v25, v23
; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26
; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24
@@ -45451,6 +45691,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; VI-NEXT: s_lshr_b32 s42, s17, 16
; VI-NEXT: s_lshr_b32 s43, s16, 16
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_mov_b32_e32 v32, v15
; VI-NEXT: v_mov_b32_e32 v33, v14
; VI-NEXT: v_mov_b32_e32 v34, v13
@@ -45467,7 +45708,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; VI-NEXT: v_mov_b32_e32 v53, v2
; VI-NEXT: v_mov_b32_e32 v54, v1
; VI-NEXT: v_mov_b32_e32 v55, v0
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: s_cbranch_scc0 .LBB55_4
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_mov_b32_e32 v0, 16
@@ -45671,11 +45912,28 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; VI-NEXT: s_setpc_b64 s[30:31]
; VI-NEXT: .LBB55_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; VI-NEXT: s_branch .LBB55_2
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccz .LBB55_2
+; VI-NEXT: s_branch .LBB55_3
;
; GFX9-LABEL: bitcast_v60f16_to_v15f64_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_lshr_b32 s40, s29, 16
+; GFX9-NEXT: s_lshr_b32 s41, s28, 16
+; GFX9-NEXT: s_lshr_b32 s42, s27, 16
+; GFX9-NEXT: s_lshr_b32 s43, s26, 16
+; GFX9-NEXT: s_lshr_b32 s15, s25, 16
+; GFX9-NEXT: s_lshr_b32 s14, s24, 16
+; GFX9-NEXT: s_lshr_b32 s13, s23, 16
+; GFX9-NEXT: s_lshr_b32 s12, s22, 16
+; GFX9-NEXT: s_lshr_b32 s11, s21, 16
+; GFX9-NEXT: s_lshr_b32 s10, s20, 16
+; GFX9-NEXT: s_lshr_b32 s9, s19, 16
+; GFX9-NEXT: s_lshr_b32 s8, s18, 16
+; GFX9-NEXT: s_lshr_b32 s7, s17, 16
+; GFX9-NEXT: s_lshr_b32 s6, s16, 16
; GFX9-NEXT: v_mov_b32_e32 v32, v15
; GFX9-NEXT: v_mov_b32_e32 v33, v14
; GFX9-NEXT: v_mov_b32_e32 v34, v13
@@ -45692,21 +45950,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX9-NEXT: v_mov_b32_e32 v53, v2
; GFX9-NEXT: v_mov_b32_e32 v54, v1
; GFX9-NEXT: v_mov_b32_e32 v55, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_lshr_b32 s40, s29, 16
-; GFX9-NEXT: s_lshr_b32 s41, s28, 16
-; GFX9-NEXT: s_lshr_b32 s42, s27, 16
-; GFX9-NEXT: s_lshr_b32 s43, s26, 16
-; GFX9-NEXT: s_lshr_b32 s15, s25, 16
-; GFX9-NEXT: s_lshr_b32 s14, s24, 16
-; GFX9-NEXT: s_lshr_b32 s13, s23, 16
-; GFX9-NEXT: s_lshr_b32 s12, s22, 16
-; GFX9-NEXT: s_lshr_b32 s11, s21, 16
-; GFX9-NEXT: s_lshr_b32 s10, s20, 16
-; GFX9-NEXT: s_lshr_b32 s9, s19, 16
-; GFX9-NEXT: s_lshr_b32 s8, s18, 16
-; GFX9-NEXT: s_lshr_b32 s7, s17, 16
-; GFX9-NEXT: s_lshr_b32 s6, s16, 16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -45727,7 +45971,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33
; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34
; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7
; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8
@@ -45742,6 +45985,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42
; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41
; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37
; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38
@@ -45889,7 +46133,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: .LBB55_4:
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX9-NEXT: s_branch .LBB55_2
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccz .LBB55_2
+; GFX9-NEXT: s_branch .LBB55_3
;
; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15f64_scalar:
; GFX11-TRUE16: ; %bb.0:
@@ -45934,41 +46180,41 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s15, 0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1
+; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -45983,17 +46229,16 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_3
; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -46007,24 +46252,24 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -46041,7 +46286,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB55_4:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB55_2
+; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB55_2
+; GFX11-TRUE16-NEXT: s_branch .LBB55_3
;
; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15f64_scalar:
; GFX11-FAKE16: ; %bb.0:
@@ -46074,41 +46321,41 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16
-; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46
+; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40
-; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s42
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s41
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s40
+; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4
; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
@@ -46123,17 +46370,16 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_3
; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70
@@ -46147,24 +46393,24 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -46181,7 +46427,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-FAKE16-NEXT: .LBB55_4:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB55_2
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18
+; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB55_2
+; GFX11-FAKE16-NEXT: s_branch .LBB55_3
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -47867,9 +48115,10 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt vmcnt(14)
-; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: s_cbranch_scc0 .LBB57_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_cvt_f32_f16_e32 v31, v1
@@ -47917,86 +48166,87 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s22
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
+; SI-NEXT: v_mov_b32_e32 v36, v35
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v8
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s23
-; SI-NEXT: v_mov_b32_e32 v35, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v9
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s24
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v34
+; SI-NEXT: v_mov_b32_e32 v35, v34
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v10
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s25
-; SI-NEXT: v_mov_b32_e32 v34, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v34
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v11
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s26
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v33
+; SI-NEXT: v_mov_b32_e32 v34, v33
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v12
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s27
-; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v33
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v13
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s28
-; SI-NEXT: v_cvt_f32_f16_e32 v54, v32
+; SI-NEXT: v_mov_b32_e32 v33, v32
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v14
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s29
-; SI-NEXT: v_mov_b32_e32 v32, v50
+; SI-NEXT: v_cvt_f32_f16_e32 v54, v32
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v15
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, v28
-; SI-NEXT: v_cvt_f32_f16_e32 v41, v50
+; SI-NEXT: v_mov_b32_e32 v32, v50
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v16
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, v30
-; SI-NEXT: v_cvt_f32_f16_e32 v52, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v41, v50
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v17
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, v59
-; SI-NEXT: v_cvt_f32_f16_e32 v55, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v52, v63
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v62
; SI-NEXT: v_cvt_f32_f16_e32 v50, v61
; SI-NEXT: v_cvt_f32_f16_e32 v53, v60
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v31, v19
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; SI-NEXT: v_cvt_f32_f16_e32 v51, v58
; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
@@ -48029,27 +48279,27 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; kill: killed $vgpr49
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v35, v34
+; SI-NEXT: v_mov_b32_e32 v36, v35
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; kill: killed $vgpr49
-; SI-NEXT: v_mov_b32_e32 v34, v33
+; SI-NEXT: v_mov_b32_e32 v35, v34
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; kill: killed $vgpr49
-; SI-NEXT: v_mov_b32_e32 v33, v32
+; SI-NEXT: v_mov_b32_e32 v34, v33
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; kill: killed $vgpr49
-; SI-NEXT: v_mov_b32_e32 v32, v50
+; SI-NEXT: v_mov_b32_e32 v33, v32
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr49
; SI-NEXT: ; kill: killed $vgpr49
-; SI-NEXT: s_mov_b64 s[4:5], -1
+; SI-NEXT: v_mov_b32_e32 v32, v50
; SI-NEXT: ; implicit-def: $vgpr31
; SI-NEXT: ; kill: killed $vgpr31
; SI-NEXT: ; implicit-def: $vgpr49
@@ -48151,6 +48401,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v33
; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v34
; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v35
+; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v36
; SI-NEXT: s_add_i32 s16, s16, 3
; SI-NEXT: v_cvt_f32_f16_e32 v49, s16
; SI-NEXT: s_add_i32 s17, s17, 3
@@ -48193,50 +48444,49 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s23
; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63
-; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36
+; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s24
-; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37
; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38
+; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s25
-; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39
; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48
+; SI-NEXT: v_cvt_f32_f16_e32 v44, v48
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s26
-; SI-NEXT: v_cvt_f32_f16_e32 v44, v48
; SI-NEXT: v_cvt_f32_f16_e32 v57, v39
; SI-NEXT: v_cvt_f32_f16_e32 v42, v38
+; SI-NEXT: v_cvt_f32_f16_e32 v47, v37
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s27
-; SI-NEXT: v_cvt_f32_f16_e32 v47, v37
-; SI-NEXT: v_cvt_f32_f16_e32 v40, v36
+; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
; SI-NEXT: v_cvt_f32_f16_e32 v54, v34
+; SI-NEXT: v_cvt_f32_f16_e32 v43, v33
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s28
-; SI-NEXT: v_cvt_f32_f16_e32 v43, v33
; SI-NEXT: v_cvt_f32_f16_e32 v52, v32
; SI-NEXT: v_cvt_f32_f16_e32 v41, v31
-; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(13)
-; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v50, v63
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, s29
-; SI-NEXT: v_cvt_f32_f16_e32 v45, v35
-; SI-NEXT: v_cvt_f32_f16_e32 v50, v63
+; SI-NEXT: v_cvt_f32_f16_e32 v40, v36
; SI-NEXT: v_cvt_f32_f16_e32 v55, v62
+; SI-NEXT: v_cvt_f32_f16_e32 v53, v60
; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v49, v61
-; SI-NEXT: v_cvt_f32_f16_e32 v53, v60
; SI-NEXT: v_cvt_f32_f16_e32 v51, v58
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -48746,11 +48996,12 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; VI-NEXT: s_lshr_b32 s41, s18, 16
; VI-NEXT: s_lshr_b32 s42, s17, 16
; VI-NEXT: s_lshr_b32 s43, s16, 16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15
; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v14
; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13
; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11
; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10
; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9
@@ -48763,10 +49014,13 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0
-; VI-NEXT: s_cbranch_scc0 .LBB57_4
+; VI-NEXT: s_cbranch_scc0 .LBB57_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB57_3
-; VI-NEXT: .LBB57_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB57_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB57_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: s_add_i32 s16, s16, 3
; VI-NEXT: s_add_i32 s43, s43, 3
; VI-NEXT: s_add_i32 s17, s17, 3
@@ -48827,7 +49081,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28
; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15
; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29
-; VI-NEXT: .LBB57_3: ; %end
+; VI-NEXT: .LBB57_4: ; %end
; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; VI-NEXT: s_and_b32 s4, 0xffff, s16
; VI-NEXT: s_lshl_b32 s5, s43, 16
@@ -48919,8 +49173,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; VI-NEXT: v_mov_b32_e32 v14, v30
; VI-NEXT: v_mov_b32_e32 v15, v31
; VI-NEXT: s_setpc_b64 s[30:31]
-; VI-NEXT: .LBB57_4:
-; VI-NEXT: s_branch .LBB57_2
;
; GFX9-LABEL: bitcast_v60i16_to_v60f16_scalar:
; GFX9: ; %bb.0:
@@ -48940,11 +49192,12 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s8, s18, 16
; GFX9-NEXT: s_lshr_b32 s7, s17, 16
; GFX9-NEXT: s_lshr_b32 s6, s16, 16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15
; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14
; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9
@@ -48969,10 +49222,13 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: s_cbranch_scc0 .LBB57_3
+; GFX9-NEXT: s_cbranch_scc0 .LBB57_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB57_4
-; GFX9-NEXT: .LBB57_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB57_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB57_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43
; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0]
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42
@@ -49080,8 +49336,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14
; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15
; GFX9-NEXT: s_branch .LBB57_5
-; GFX9-NEXT: .LBB57_3:
-; GFX9-NEXT: s_branch .LBB57_2
; GFX9-NEXT: .LBB57_4:
; GFX9-NEXT: v_mov_b32_e32 v30, s29
; GFX9-NEXT: v_mov_b32_e32 v31, s28
@@ -49249,13 +49503,16 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s46, -1
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
-; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: .LBB57_2: ; %Flow
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4
-; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
@@ -49359,8 +49616,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
; GFX11-TRUE16-NEXT: s_branch .LBB57_5
-; GFX11-TRUE16-NEXT: .LBB57_3:
-; GFX11-TRUE16-NEXT: s_branch .LBB57_2
; GFX11-TRUE16-NEXT: .LBB57_4:
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26
@@ -49480,19 +49735,22 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s46, -1
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0
-; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: .LBB57_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4
-; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
@@ -49529,10 +49787,10 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -49559,12 +49817,12 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, s12, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s11, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s10, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v49, s9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v49, s8, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v48, s0, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v39, s1, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v38, s2, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s6, 3 op_sel_hi:[1,0]
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38
@@ -49596,8 +49854,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
; GFX11-FAKE16-NEXT: s_branch .LBB57_5
-; GFX11-FAKE16-NEXT: .LBB57_3:
-; GFX11-FAKE16-NEXT: s_branch .LBB57_2
; GFX11-FAKE16-NEXT: .LBB57_4:
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26
@@ -49614,8 +49870,8 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s15 :: v_dual_mov_b32 v65, s14
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s13 :: v_dual_mov_b32 v67, s12
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s11 :: v_dual_mov_b32 v69, s10
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s9 :: v_dual_mov_b32 v71, s7
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s9 :: v_dual_mov_b32 v81, s7
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s5
; GFX11-FAKE16-NEXT: .LBB57_5: ; %end
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49
@@ -51066,13 +51322,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_cvt_f16_f32_e32 v58, s25
; SI-NEXT: v_cvt_f16_f32_e32 v57, s29
; SI-NEXT: s_and_b64 s[4:5], vcc, exec
+; SI-NEXT: s_mov_b64 s[4:5], -1
; SI-NEXT: s_cbranch_scc0 .LBB59_2
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: s_branch .LBB59_3
-; SI-NEXT: .LBB59_2:
-; SI-NEXT: s_mov_b64 s[4:5], -1
-; SI-NEXT: .LBB59_3: ; %Flow
+; SI-NEXT: .LBB59_2: ; %Flow
; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; SI-NEXT: v_mov_b32_e32 v61, v14
; SI-NEXT: v_mov_b32_e32 v63, v15
@@ -51084,8 +51338,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_mov_b32_e32 v8, v5
; SI-NEXT: v_mov_b32_e32 v5, v42
; SI-NEXT: v_mov_b32_e32 v42, v1
-; SI-NEXT: s_cbranch_vccnz .LBB59_5
-; SI-NEXT: ; %bb.4: ; %cmp.true
+; SI-NEXT: s_cbranch_vccnz .LBB59_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v1, v62
@@ -51348,7 +51602,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; SI-NEXT: v_alignbit_b32 v19, v3, v19, 16
; SI-NEXT: v_alignbit_b32 v36, v62, v14, 16
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: .LBB59_5: ; %end
+; SI-NEXT: .LBB59_4: ; %end
; SI-NEXT: v_and_b32_e32 v39, 0xffff, v47
; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v60
; SI-NEXT: v_or_b32_e32 v39, v39, v50
@@ -51562,11 +51816,12 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; VI-NEXT: s_lshr_b32 s41, s18, 16
; VI-NEXT: s_lshr_b32 s42, s17, 16
; VI-NEXT: s_lshr_b32 s43, s16, 16
+; VI-NEXT: s_and_b64 s[4:5], vcc, exec
; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15
; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v14
; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13
; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12
-; VI-NEXT: s_and_b64 s[4:5], vcc, exec
+; VI-NEXT: s_mov_b64 s[4:5], -1
; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11
; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10
; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9
@@ -51591,10 +51846,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: s_cbranch_scc0 .LBB59_3
+; VI-NEXT: s_cbranch_scc0 .LBB59_2
; VI-NEXT: ; %bb.1: ; %cmp.false
-; VI-NEXT: s_cbranch_execnz .LBB59_4
-; VI-NEXT: .LBB59_2: ; %cmp.true
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB59_2: ; %Flow
+; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_cbranch_vccnz .LBB59_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_mov_b32_e32 v54, 0x200
; VI-NEXT: v_add_f16_e32 v32, s16, v54
; VI-NEXT: v_add_f16_e32 v59, s43, v54
@@ -51657,8 +51915,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; VI-NEXT: v_add_f16_e32 v15, 0x200, v15
; VI-NEXT: v_add_f16_e32 v29, 0x200, v29
; VI-NEXT: s_branch .LBB59_5
-; VI-NEXT: .LBB59_3:
-; VI-NEXT: s_branch .LBB59_2
; VI-NEXT: .LBB59_4:
; VI-NEXT: v_mov_b32_e32 v54, s6
; VI-NEXT: v_mov_b32_e32 v53, s29
@@ -51798,11 +52054,12 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: s_lshr_b32 s8, s18, 16
; GFX9-NEXT: s_lshr_b32 s7, s17, 16
; GFX9-NEXT: s_lshr_b32 s6, s16, 16
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15
; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14
; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12
-; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_mov_b64 s[4:5], -1
; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9
@@ -51827,10 +52084,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: s_cbranch_scc0 .LBB59_3
+; GFX9-NEXT: s_cbranch_scc0 .LBB59_2
; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: s_cbranch_execnz .LBB59_4
-; GFX9-NEXT: .LBB59_2: ; %cmp.true
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB59_2: ; %Flow
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_vccnz .LBB59_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13
@@ -51940,8 +52200,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14
; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15
; GFX9-NEXT: s_branch .LBB59_5
-; GFX9-NEXT: .LBB59_3:
-; GFX9-NEXT: s_branch .LBB59_2
; GFX9-NEXT: .LBB59_4:
; GFX9-NEXT: v_mov_b32_e32 v30, s29
; GFX9-NEXT: v_mov_b32_e32 v31, s28
@@ -52109,13 +52367,16 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16
+; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s46, -1
+; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
-; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT: .LBB59_2: ; %Flow
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4
-; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
+; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
@@ -52219,8 +52480,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
; GFX11-TRUE16-NEXT: s_branch .LBB59_5
-; GFX11-TRUE16-NEXT: .LBB59_3:
-; GFX11-TRUE16-NEXT: s_branch .LBB59_2
; GFX11-TRUE16-NEXT: .LBB59_4:
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26
@@ -52340,19 +52599,22 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s16, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s2, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s46, -1
+; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0
-; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT: .LBB59_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4
-; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
@@ -52389,10 +52651,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s16, s6
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s9
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
@@ -52419,12 +52681,12 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, s12 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s11 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v49, 0x200, s9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v49, 0x200, s8 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v48, 0x200, s0 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v39, 0x200, s1 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v38, 0x200, s2 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s6 op_sel_hi:[0,1]
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38
@@ -52456,8 +52718,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
; GFX11-FAKE16-NEXT: s_branch .LBB59_5
-; GFX11-FAKE16-NEXT: .LBB59_3:
-; GFX11-FAKE16-NEXT: s_branch .LBB59_2
; GFX11-FAKE16-NEXT: .LBB59_4:
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26
@@ -52474,8 +52734,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s15 :: v_dual_mov_b32 v65, s14
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s13 :: v_dual_mov_b32 v67, s12
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s11 :: v_dual_mov_b32 v69, s10
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s9 :: v_dual_mov_b32 v71, s7
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s8
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s8 :: v_dual_mov_b32 v71, s6
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s9 :: v_dual_mov_b32 v81, s7
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s5
; GFX11-FAKE16-NEXT: .LBB59_5: ; %end
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49