summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/load-global-i8.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i8.ll1230
1 files changed, 643 insertions, 587 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index 0c399d65d01c..b75c8c7e4177 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -6267,26 +6267,29 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v1
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 8
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s10, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s5, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s5, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s10, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[10:11], 0x80000
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s4, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s4, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s10, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s10, 24
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[14:15], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s15
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16
@@ -6295,17 +6298,17 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s9
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_endpgm
;
@@ -6319,47 +6322,48 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-HSA-NEXT: s_mov_b32 s3, 0
+; GCN-HSA-NEXT: s_mov_b32 s5, s3
+; GCN-HSA-NEXT: s_mov_b32 s7, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1
-; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v0
-; GCN-HSA-NEXT: s_lshr_b32 s4, s2, 16
-; GCN-HSA-NEXT: s_lshr_b32 s6, s3, 16
-; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 24
-; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 8
-; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 8
-; GCN-HSA-NEXT: s_ashr_i32 s13, s2, 31
+; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v1
+; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-HSA-NEXT: s_lshr_b32 s2, s6, 16
+; GCN-HSA-NEXT: s_lshr_b32 s8, s4, 16
+; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 24
+; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 8
+; GCN-HSA-NEXT: s_lshr_b32 s4, s6, 8
+; GCN-HSA-NEXT: s_ashr_i32 s13, s6, 31
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x80000
-; GCN-HSA-NEXT: s_ashr_i32 s16, s2, 24
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[6:7], 0x80000
+; GCN-HSA-NEXT: s_ashr_i32 s16, s6, 24
+; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT: s_add_u32 s4, s0, 48
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s9
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s11
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
@@ -6367,6 +6371,8 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-HSA-NEXT: s_endpgm
@@ -6382,25 +6388,27 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v0
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s8, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 24
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 8
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 8
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s4, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s4, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s8, 8
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s8, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s8, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[8:9], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s18
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
@@ -6408,14 +6416,14 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -6931,77 +6939,83 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, 0
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s13
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s13
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, s13
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s17, s13
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s13
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s4, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s4, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s18, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s18, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s18, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s7, 8
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s19, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s19, 8
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s19
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[18:19], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s19, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s19, 24
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x80000
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 24
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[14:15], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[28:29], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s13
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s22
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s31
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s15
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11
@@ -7025,109 +7039,115 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: s_mov_b32 s3, 0
+; GCN-HSA-NEXT: s_mov_b32 s5, s3
+; GCN-HSA-NEXT: s_mov_b32 s7, s3
+; GCN-HSA-NEXT: s_mov_b32 s9, s3
+; GCN-HSA-NEXT: s_mov_b32 s11, s3
+; GCN-HSA-NEXT: s_mov_b32 s13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
-; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3
-; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0
-; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1
-; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16
-; GCN-HSA-NEXT: s_lshr_b32 s8, s2, 24
-; GCN-HSA-NEXT: s_lshr_b32 s10, s2, 8
-; GCN-HSA-NEXT: s_lshr_b32 s18, s3, 16
-; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 8
-; GCN-HSA-NEXT: s_mov_b32 s22, s3
-; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31
-; GCN-HSA-NEXT: s_ashr_i32 s9, s3, 24
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
-; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16
-; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24
-; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
-; GCN-HSA-NEXT: s_ashr_i32 s4, s5, 24
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
-; GCN-HSA-NEXT: s_lshr_b32 s2, s5, 16
-; GCN-HSA-NEXT: s_ashr_i32 s3, s5, 31
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
-; GCN-HSA-NEXT: s_lshr_b32 s4, s5, 8
-; GCN-HSA-NEXT: s_mov_b32 s24, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GCN-HSA-NEXT: v_readfirstlane_b32 s14, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s15, v3
+; GCN-HSA-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s17, v1
+; GCN-HSA-NEXT: s_lshr_b32 s18, s14, 16
+; GCN-HSA-NEXT: s_lshr_b32 s20, s14, 24
+; GCN-HSA-NEXT: s_lshr_b32 s22, s14, 8
+; GCN-HSA-NEXT: s_lshr_b32 s2, s15, 16
+; GCN-HSA-NEXT: s_lshr_b32 s4, s15, 8
+; GCN-HSA-NEXT: s_mov_b32 s6, s15
+; GCN-HSA-NEXT: s_ashr_i32 s8, s15, 31
+; GCN-HSA-NEXT: s_ashr_i32 s10, s15, 24
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15
+; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[16:17], 0x80000
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s10
+; GCN-HSA-NEXT: s_ashr_i32 s10, s17, 31
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s8
+; GCN-HSA-NEXT: s_ashr_i32 s8, s17, 24
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14
+; GCN-HSA-NEXT: s_lshr_b32 s14, s16, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s10
+; GCN-HSA-NEXT: s_lshr_b32 s24, s16, 24
+; GCN-HSA-NEXT: s_lshr_b32 s8, s17, 16
+; GCN-HSA-NEXT: s_lshr_b32 s10, s17, 8
+; GCN-HSA-NEXT: s_mov_b32 s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 8
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15
; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
-; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50
+; GCN-HSA-NEXT: s_add_u32 s16, s0, 0x50
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5]
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
-; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
-; GCN-HSA-NEXT: s_add_u32 s6, s0, 64
-; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s18
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s14
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s15
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s24
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s25
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s8
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s11
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GCN-HSA-NEXT: s_endpgm
@@ -7144,65 +7164,69 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s15, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s15
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s15
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s15
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 24
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 24
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 8
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s6, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s6, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 8
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s4, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s12, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s12, 24
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s12, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s5, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 8
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s5
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s7, 8
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s7
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s5, 8
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s5
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s13, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s13, 8
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s13
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s7, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s7, 24
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s13, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s13, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s17
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s18
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s19
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s5, 31
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[10:11], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s20
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s21
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s22
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s23
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s24
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s25
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s22
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s23
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s24
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s25
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s26
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s27
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s38
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s33
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s31
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s26
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s27
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s15
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
@@ -8171,170 +8195,184 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s37, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s17, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s21, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s23, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s25, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s27, s7
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v2
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v3
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v0
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v1
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s28, v2
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s29, v3
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s34, v0
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s35, v1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v6
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v7
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v4
-; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v5
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s18, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s18, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s18, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s22, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s22, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s22, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s24, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s24, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s24, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s12, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s12, 24
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s12, 8
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s19, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[18:19], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[22:23], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[24:25], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s44
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s45
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s19, 8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s42
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s43
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s19
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s40
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s23, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s30, v6
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s31, v7
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v4
+; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v5
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s28, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s28, 24
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[28:29], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[34:35], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s39
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[4:5], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s28, 8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s40
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s41
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s34, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[8:9], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[6:7], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s39
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s34, 24
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s40
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s41
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s34, 8
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s29
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s38
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s23, 8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s23
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x80000
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s25, 16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s25, 8
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s37
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s30, 16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s39
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s30, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s29, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s29, 8
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s35, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s35, 8
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s35
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s31, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s31, 8
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s31
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[22:23], 0x80000
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:192
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s25, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s23, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 24
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s19, 31
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s19, 24
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s25, 24
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s25
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s13, 16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s26
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s27
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 8
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s37
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s31, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s35, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s35, 24
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s29, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s29, 24
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s31, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s30, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[24:25], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[26:27], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s37
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s4, 16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s4, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 8
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s13, 31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s13, 24
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s13
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[22:23], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[28:29], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[36:37], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[38:39], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s29
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s5, 31
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s35
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s5, 24
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s4, 8
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[16:17], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[24:25], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[22:23], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[40:41], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[38:39], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s44
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s30
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s31
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s41
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s9
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s11
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s36
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s37
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s15
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s42
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s43
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s43
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s44
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s33
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s45
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s7
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s19
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s45
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s20
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s9
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s30
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s31
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s46
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s14
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s15
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s18
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s19
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s17
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i64:
@@ -8352,174 +8390,172 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: s_mov_b32 s3, 0
+; GCN-HSA-NEXT: s_mov_b32 s19, s3
+; GCN-HSA-NEXT: s_mov_b32 s5, s3
+; GCN-HSA-NEXT: s_mov_b32 s7, s3
+; GCN-HSA-NEXT: s_mov_b32 s29, s3
+; GCN-HSA-NEXT: s_mov_b32 s15, s3
+; GCN-HSA-NEXT: s_mov_b32 s31, s3
+; GCN-HSA-NEXT: s_mov_b32 s13, s3
+; GCN-HSA-NEXT: s_mov_b32 s9, s3
+; GCN-HSA-NEXT: s_mov_b32 s11, s3
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v6
-; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v4
-; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v5
-; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v7
-; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16
-; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 24
-; GCN-HSA-NEXT: s_lshr_b32 s10, s8, 16
-; GCN-HSA-NEXT: s_lshr_b32 s2, s8, 24
-; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x80000
-; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 8
-; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 8
-; GCN-HSA-NEXT: s_lshr_b32 s12, s7, 16
-; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 8
-; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000
-; GCN-HSA-NEXT: s_lshr_b32 s6, s9, 16
-; GCN-HSA-NEXT: s_mov_b32 s28, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27
-; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[18:19], 0x80000
+; GCN-HSA-NEXT: v_readfirstlane_b32 s16, v6
+; GCN-HSA-NEXT: v_readfirstlane_b32 s17, v7
+; GCN-HSA-NEXT: v_readfirstlane_b32 s20, v4
+; GCN-HSA-NEXT: v_readfirstlane_b32 s21, v5
+; GCN-HSA-NEXT: s_lshr_b32 s18, s16, 24
+; GCN-HSA-NEXT: s_lshr_b32 s14, s16, 16
+; GCN-HSA-NEXT: s_lshr_b32 s26, s20, 16
+; GCN-HSA-NEXT: s_lshr_b32 s28, s20, 24
+; GCN-HSA-NEXT: s_lshr_b32 s30, s20, 8
+; GCN-HSA-NEXT: s_lshr_b32 s4, s17, 8
+; GCN-HSA-NEXT: s_mov_b32 s6, s17
+; GCN-HSA-NEXT: s_lshr_b32 s12, s21, 16
+; GCN-HSA-NEXT: s_lshr_b32 s8, s21, 8
+; GCN-HSA-NEXT: s_mov_b32 s10, s21
+; GCN-HSA-NEXT: s_ashr_i32 s37, s21, 31
+; GCN-HSA-NEXT: s_ashr_i32 s38, s21, 24
+; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[20:21], 0x80000
+; GCN-HSA-NEXT: s_lshr_b32 s34, s16, 8
+; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[18:19], 0x80000
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_readfirstlane_b32 s40, v2
-; GCN-HSA-NEXT: v_readfirstlane_b32 s41, v3
-; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000
-; GCN-HSA-NEXT: v_readfirstlane_b32 s44, v0
-; GCN-HSA-NEXT: v_readfirstlane_b32 s45, v1
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
-; GCN-HSA-NEXT: s_mov_b32 s22, s7
-; GCN-HSA-NEXT: s_lshr_b32 s8, s9, 8
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25
-; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[16:17], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[12:13], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[4:5], 0x80000
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s26
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s27
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT: s_lshr_b32 s42, s44, 16
-; GCN-HSA-NEXT: s_lshr_b32 s48, s44, 24
-; GCN-HSA-NEXT: s_lshr_b32 s28, s44, 8
-; GCN-HSA-NEXT: s_lshr_b32 s6, s45, 16
-; GCN-HSA-NEXT: s_lshr_b32 s2, s45, 8
-; GCN-HSA-NEXT: s_mov_b32 s4, s45
-; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x80000
-; GCN-HSA-NEXT: s_lshr_b32 s44, s40, 16
-; GCN-HSA-NEXT: s_lshr_b32 s50, s40, 24
-; GCN-HSA-NEXT: s_lshr_b32 s52, s40, 8
-; GCN-HSA-NEXT: s_lshr_b32 s20, s41, 16
-; GCN-HSA-NEXT: s_lshr_b32 s12, s41, 8
-; GCN-HSA-NEXT: s_mov_b32 s14, s41
-; GCN-HSA-NEXT: s_ashr_i32 s33, s9, 31
-; GCN-HSA-NEXT: s_ashr_i32 s37, s7, 31
-; GCN-HSA-NEXT: s_ashr_i32 s38, s7, 24
-; GCN-HSA-NEXT: s_ashr_i32 s34, s9, 24
+; GCN-HSA-NEXT: v_readfirstlane_b32 s48, v2
+; GCN-HSA-NEXT: v_readfirstlane_b32 s49, v3
+; GCN-HSA-NEXT: v_readfirstlane_b32 s50, v0
+; GCN-HSA-NEXT: v_readfirstlane_b32 s51, v1
+; GCN-HSA-NEXT: s_lshr_b32 s2, s17, 16
+; GCN-HSA-NEXT: s_ashr_i32 s41, s17, 31
+; GCN-HSA-NEXT: s_ashr_i32 s42, s17, 24
+; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[16:17], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[14:15], 0x80000
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s21
+; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[30:31], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[28:29], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[26:27], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[58:59], s[34:35], 0x80000
+; GCN-HSA-NEXT: s_lshr_b32 s18, s49, 16
+; GCN-HSA-NEXT: s_lshr_b32 s28, s49, 8
+; GCN-HSA-NEXT: s_mov_b32 s14, s49
+; GCN-HSA-NEXT: s_lshr_b32 s30, s51, 16
+; GCN-HSA-NEXT: s_lshr_b32 s6, s51, 8
+; GCN-HSA-NEXT: s_mov_b32 s4, s51
+; GCN-HSA-NEXT: s_lshr_b32 s34, s50, 16
+; GCN-HSA-NEXT: s_lshr_b32 s60, s50, 24
+; GCN-HSA-NEXT: s_lshr_b32 s62, s50, 8
+; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[50:51], 0x80000
+; GCN-HSA-NEXT: s_lshr_b32 s50, s48, 16
+; GCN-HSA-NEXT: s_lshr_b32 s64, s48, 24
+; GCN-HSA-NEXT: s_lshr_b32 s66, s48, 8
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
+; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s21
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
-; GCN-HSA-NEXT: s_ashr_i32 s30, s45, 31
-; GCN-HSA-NEXT: s_ashr_i32 s31, s45, 24
-; GCN-HSA-NEXT: s_ashr_i32 s35, s41, 31
-; GCN-HSA-NEXT: s_ashr_i32 s36, s41, 24
-; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[12:13], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[2:3], 0x80000
+; GCN-HSA-NEXT: s_ashr_i32 s33, s51, 31
+; GCN-HSA-NEXT: s_ashr_i32 s36, s51, 24
+; GCN-HSA-NEXT: s_ashr_i32 s39, s49, 31
+; GCN-HSA-NEXT: s_ashr_i32 s40, s49, 24
+; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[28:29], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[62:63], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[60:61], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[60:61], s[66:67], 0x80000
+; GCN-HSA-NEXT: s_bfe_i64 s[62:63], s[64:65], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
-; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50
-; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55
-; GCN-HSA-NEXT: s_add_u32 s54, s0, 64
-; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s54
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55
-; GCN-HSA-NEXT: s_add_u32 s54, s0, 16
-; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40
-; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s41
-; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41
-; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0
-; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55
-; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
+; GCN-HSA-NEXT: s_add_u32 s64, s0, 0x50
+; GCN-HSA-NEXT: s_addc_u32 s65, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46
+; GCN-HSA-NEXT: s_add_u32 s46, s0, 64
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47
+; GCN-HSA-NEXT: s_addc_u32 s47, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s64
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47
+; GCN-HSA-NEXT: s_add_u32 s46, s0, 16
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s65
+; GCN-HSA-NEXT: s_addc_u32 s47, s1, 0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s58
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s34
+; GCN-HSA-NEXT: s_add_u32 s34, s0, 0xd0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s35
+; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v24, s34
+; GCN-HSA-NEXT: v_mov_b32_e32 v25, s35
+; GCN-HSA-NEXT: s_add_u32 s34, s0, 0xc0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s59
+; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s56
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26
; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x90
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s57
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s54
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s55
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27
; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26
-; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
+; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11]
+; GCN-HSA-NEXT: v_mov_b32_e32 v29, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v28, s26
; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s44
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24
; GCN-HSA-NEXT: s_add_u32 s24, s0, 0x70
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s45
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s50
-; GCN-HSA-NEXT: v_mov_b32_e32 v19, s51
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s45
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s52
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s53
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25
; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52
-; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49
-; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26
-; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18
-; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s30
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s31
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v31, s27
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s20
+; GCN-HSA-NEXT: s_add_u32 s20, s0, 0x60
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29
-; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27
-; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24
-; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19
-; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
+; GCN-HSA-NEXT: v_mov_b32_e32 v30, s26
+; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[4:7]
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s21
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24
+; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s20
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s21
+; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
; GCN-HSA-NEXT: s_add_u32 s16, s0, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_nop 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8
; GCN-HSA-NEXT: s_add_u32 s8, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9
@@ -8532,10 +8568,10 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -8547,24 +8583,34 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: s_nop 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s62
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s63
+; GCN-HSA-NEXT: v_mov_b32_e32 v26, s34
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s48
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s49
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s60
+; GCN-HSA-NEXT: v_mov_b32_e32 v23, s61
+; GCN-HSA-NEXT: v_mov_b32_e32 v27, s35
+; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[20:23]
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
@@ -8582,151 +8628,161 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s31, 0
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s25, s31
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s23, s31
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s17, s31
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s15, s31
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s31
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s31
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s31
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 24
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s4, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 8
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s9, 8
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s11, 24
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
-; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s6, 8
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 24
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s10, 8
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s11, 8
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s66, s11
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[4:5], 0x80000
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s11, 31
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x80000
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s4, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s8, 16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s40
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s41
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s12, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s12, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[70:71], s[4:5], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s8, 8
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
+; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s18, v6
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s12, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s18, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s70
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s71
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s44
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s18, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s19, v7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s48
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s26, v4
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s18, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s26, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s26, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[18:19], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s27, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s52
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s26, 8
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s5, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[26:27], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s5
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s5, 31
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s5, 8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s59
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s24, s5
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s19, 24
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s69, s5, 31
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s5, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s7, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s13, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s27, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s27, 8
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s27
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s27, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s27, 24
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[24:25], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[62:63], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s7, 8
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s13, 8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, s7
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s7, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s7, 24
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s13
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s13, 31
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s13, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s9, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s19, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s19, 8
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s19
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s19, 31
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[16:17], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[64:65], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s9
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 31
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s9, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[10:11], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[66:67], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s65
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 24
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x80000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s63
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x80000
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s44
+; GCN-NOHSA-VI-NEXT: s_nop 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s39
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s38
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s45
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s46
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s47
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT: s_nop 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4