summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll39
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll38
-rw-r--r--llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll21
-rw-r--r--llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul_int24.ll49
-rw-r--r--llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/sra.ll60
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll258
13 files changed, 269 insertions, 317 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index 0a2e758f7cf2..50cc28810000 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -879,44 +879,43 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: s_load_dword s4, s[4:5], 0xe
+; SI-NEXT: s_brev_b32 s5, -2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_bfi_b32 v0, s5, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_fptrunc_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_load_dword s3, s[4:5], 0x38
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_brev_b32 s0, -2
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_bfi_b32 v2, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_bfi_b32 v2, s4, v0, v1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x38
+; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s3, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%sign.trunc = fptrunc double %sign to float
%result = call float @llvm.copysign.f32(float %mag, float %sign.trunc)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 8b5c34d97e50..674924e3a925 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -13,14 +13,14 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: s_load_dword s4, s[4:5], 0x1e
+; SI-NEXT: s_brev_b32 s5, -2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: v_bfi_b32 v1, s4, v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_bfi_b32 v1, s5, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -28,32 +28,32 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
; VI-LABEL: s_test_copysign_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
-; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x74
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT: s_load_dword s6, s[4:5], 0x78
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_bfi_b32 v1, s4, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x74
-; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x4c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x78
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_endpgm
%result = call double @llvm.copysign.f64(double %mag, double %sign)
store double %result, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
index 5df61f19033a..d794dcf06f99 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll
@@ -8,7 +8,7 @@ define amdgpu_gs i32 @vgpr16_copyto_sgpr() {
; CHECK-LABEL: vgpr16_copyto_sgpr:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_mov_b32_e32 v0, lds@abs32@lo
-; CHECK-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
+; CHECK-NEXT: ds_load_b32 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
index 6e82a294243d..b1becd0409a3 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
@@ -19,31 +19,30 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; FORWARDXNACK-LABEL: shuffle_v4f16_234u:
; FORWARDXNACK: ; %bb.0:
; FORWARDXNACK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FORWARDXNACK-NEXT: global_load_dword v6, v[0:1], off offset:4
-; FORWARDXNACK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; FORWARDXNACK-NEXT: global_load_dword v4, v[0:1], off offset:4
+; FORWARDXNACK-NEXT: global_load_dword v5, v[2:3], off
; FORWARDXNACK-NEXT: s_waitcnt vmcnt(1)
-; FORWARDXNACK-NEXT: v_mov_b32_e32 v0, v6
+; FORWARDXNACK-NEXT: v_mov_b32_e32 v0, v4
; FORWARDXNACK-NEXT: s_waitcnt vmcnt(0)
-; FORWARDXNACK-NEXT: v_mov_b32_e32 v1, v4
+; FORWARDXNACK-NEXT: v_mov_b32_e32 v1, v5
; FORWARDXNACK-NEXT: s_setpc_b64 s[30:31]
;
; REVERSEXNACK-LABEL: shuffle_v4f16_234u:
; REVERSEXNACK: ; %bb.0:
; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v1
-; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v0
-; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v3
-; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v2
-; REVERSEXNACK-NEXT: global_load_dword v0, v[5:6], off offset:4
-; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[3:4], off
+; REVERSEXNACK-NEXT: global_load_dword v5, v[0:1], off offset:4
+; REVERSEXNACK-NEXT: global_load_dword v4, v[2:3], off
+; REVERSEXNACK-NEXT: s_waitcnt vmcnt(1)
+; REVERSEXNACK-NEXT: v_mov_b32_e32 v0, v5
; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0)
+; REVERSEXNACK-NEXT: v_mov_b32_e32 v1, v4
; REVERSEXNACK-NEXT: s_setpc_b64 s[30:31]
;
; NOXNACK-LABEL: shuffle_v4f16_234u:
; NOXNACK: ; %bb.0:
; NOXNACK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOXNACK-NEXT: global_load_dword v0, v[0:1], off offset:4
-; NOXNACK-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
+; NOXNACK-NEXT: global_load_dword v1, v[2:3], off
; NOXNACK-NEXT: s_waitcnt vmcnt(0)
; NOXNACK-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, ptr addrspace(1) %arg0
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 76f204dd0c16..8ad8a5405e11 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -29,21 +29,20 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v6, s67, 17
; CHECK-NEXT: v_writelane_b32 v6, s68, 18
; CHECK-NEXT: s_getpc_b64 s[4:5]
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
; CHECK-NEXT: v_writelane_b32 v6, s69, 19
-; CHECK-NEXT: s_mov_b32 s68, 0
; CHECK-NEXT: s_mov_b32 s69, s4
-; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s68, 0
+; CHECK-NEXT: s_load_dword s6, s[4:5], 0x0
; CHECK-NEXT: s_load_dwordx8 s[24:31], s[68:69], 0x30
; CHECK-NEXT: s_load_dwordx16 s[52:67], s[68:69], 0xf0
-; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9
-; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: ; kill: killed $sgpr4_sgpr5
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130
; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v6, s70, 20
; CHECK-NEXT: v_writelane_b32 v6, s71, 21
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_writelane_b32 v7, s8, 0
; CHECK-NEXT: v_writelane_b32 v7, s9, 1
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 2daed9b69384..88d60c5fac44 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -112,10 +112,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_is_shared:
; GFX8V4: ; %bb.0:
-; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40
-; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT: s_load_dword s0, s[8:9], 0x4
+; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x40
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
+; GFX8V4-NEXT: s_cmp_eq_u32 s0, s1
; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8V4-NEXT: flat_store_dword v[0:1], v0
@@ -124,10 +124,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
;
; GFX8V5-LABEL: llvm_amdgcn_is_shared:
; GFX8V5: ; %bb.0:
-; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc
-; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0x4
+; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0xcc
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
+; GFX8V5-NEXT: s_cmp_eq_u32 s0, s1
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8V5-NEXT: flat_store_dword v[0:1], v0
@@ -166,10 +166,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
; GFX8V4-LABEL: llvm_amdgcn_is_private:
; GFX8V4: ; %bb.0:
-; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44
-; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT: s_load_dword s0, s[8:9], 0x4
+; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x44
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
+; GFX8V4-NEXT: s_cmp_eq_u32 s0, s1
; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8V4-NEXT: flat_store_dword v[0:1], v0
@@ -178,10 +178,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
;
; GFX8V5-LABEL: llvm_amdgcn_is_private:
; GFX8V5: ; %bb.0:
-; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8
-; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0x4
+; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0xc8
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
+; GFX8V5-NEXT: s_cmp_eq_u32 s0, s1
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8V5-NEXT: flat_store_dword v[0:1], v0
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 10d4eb029ee3..36dabd858c70 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -459,18 +459,18 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a,
define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 {
; SI-LABEL: test_smul24_i33:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_load_dword s6, s[4:5], 0xd
+; SI-NEXT: s_load_dword s4, s[4:5], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s5, s6, 0x180000
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: s_mul_i32 s4, s5, s4
-; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0
-; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: s_mul_i32 s5, s4, s5
+; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0
+; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -478,11 +478,12 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
;
; VI-LABEL: test_smul24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_load_dword s3, s[4:5], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s2, s2, 0x180000
-; VI-NEXT: s_bfe_i32 s3, s4, 0x180000
+; VI-NEXT: s_bfe_i32 s3, s3, 0x180000
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
@@ -569,28 +570,28 @@ entry:
define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
; SI-LABEL: test_smulhi24_i33:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT: s_load_dword s6, s[4:5], 0xd
+; SI-NEXT: s_load_dword s7, s[4:5], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smulhi24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b64 s[6:7], s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0
+; VI-NEXT: s_load_dword s6, s[4:5], 0x34
+; VI-NEXT: s_load_dword s7, s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
index 8f3acece55ce..8bf14a013149 100644
--- a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
@@ -16,7 +16,7 @@ define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
.entry:
%tmp31 = sext i32 %arg18 to i64
%tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) @indexable, i64 0, i64 %tmp31
- %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+ %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
%tmp34 = extractelement <3 x float> %tmp33, i32 0
ret float %tmp34
}
@@ -31,7 +31,7 @@ define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
%tmp1 = zext i32 %arg18 to i64
%tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
%tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset
- %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+ %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
%tmp34 = extractelement <3 x float> %tmp33, i32 0
ret float %tmp34
}
@@ -46,7 +46,7 @@ define amdgpu_ps float @const_nonuniform(i32 %arg18) {
%tmp1 = zext i32 %arg18 to i64
%tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
%tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 1
- %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+ %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
%tmp34 = extractelement <3 x float> %tmp33, i32 0
ret float %tmp34
}
@@ -61,7 +61,7 @@ define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
%tmp1 = zext i32 %arg18 to i64
%tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
%tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset
- %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+ %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
%tmp34 = extractelement <3 x float> %tmp33, i32 0
ret float %tmp34
}
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index 80c0d0f45eb9..508bd78785b6 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -830,16 +830,16 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @s_ashr_33_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_33_i64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SI-NEXT: s_load_dword s6, s[4:5], 0x14
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_ashr_i32 s6, s7, 31
-; SI-NEXT: s_ashr_i32 s7, s7, 1
-; SI-NEXT: s_add_u32 s4, s7, s4
-; SI-NEXT: s_addc_u32 s5, s6, s5
+; SI-NEXT: s_ashr_i32 s7, s6, 31
+; SI-NEXT: s_ashr_i32 s6, s6, 1
+; SI-NEXT: s_add_u32 s4, s6, s4
+; SI-NEXT: s_addc_u32 s5, s7, s5
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -847,16 +847,16 @@ define amdgpu_kernel void @s_ashr_33_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_ashr_33_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; VI-NEXT: s_load_dword s6, s[4:5], 0x50
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s6, s7, 31
-; VI-NEXT: s_ashr_i32 s7, s7, 1
-; VI-NEXT: s_add_u32 s4, s7, s4
-; VI-NEXT: s_addc_u32 s5, s6, s5
+; VI-NEXT: s_ashr_i32 s7, s6, 31
+; VI-NEXT: s_ashr_i32 s6, s6, 1
+; VI-NEXT: s_add_u32 s4, s6, s4
+; VI-NEXT: s_addc_u32 s5, s7, s5
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -953,16 +953,16 @@ define amdgpu_kernel void @v_ashr_33_i64(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @s_ashr_62_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_62_i64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SI-NEXT: s_load_dword s6, s[4:5], 0x14
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_ashr_i32 s6, s7, 31
-; SI-NEXT: s_ashr_i32 s7, s7, 30
-; SI-NEXT: s_add_u32 s4, s7, s4
-; SI-NEXT: s_addc_u32 s5, s6, s5
+; SI-NEXT: s_ashr_i32 s7, s6, 31
+; SI-NEXT: s_ashr_i32 s6, s6, 30
+; SI-NEXT: s_add_u32 s4, s6, s4
+; SI-NEXT: s_addc_u32 s5, s7, s5
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -970,16 +970,16 @@ define amdgpu_kernel void @s_ashr_62_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_ashr_62_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; VI-NEXT: s_load_dword s6, s[4:5], 0x50
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s6, s7, 31
-; VI-NEXT: s_ashr_i32 s7, s7, 30
-; VI-NEXT: s_add_u32 s4, s7, s4
-; VI-NEXT: s_addc_u32 s5, s6, s5
+; VI-NEXT: s_ashr_i32 s7, s6, 31
+; VI-NEXT: s_ashr_i32 s6, s6, 30
+; VI-NEXT: s_add_u32 s4, s6, s4
+; VI-NEXT: s_addc_u32 s5, s7, s5
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1077,15 +1077,15 @@ define amdgpu_kernel void @v_ashr_62_i64(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_63_i64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x1d
+; SI-NEXT: s_load_dword s8, s[4:5], 0x14
+; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x1d
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_ashr_i32 s5, s7, 31
-; SI-NEXT: s_add_u32 s4, s5, s8
-; SI-NEXT: s_addc_u32 s5, s5, s9
+; SI-NEXT: s_ashr_i32 s5, s8, 31
+; SI-NEXT: s_add_u32 s4, s5, s6
+; SI-NEXT: s_addc_u32 s5, s5, s7
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1093,15 +1093,15 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_ashr_63_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x74
+; VI-NEXT: s_load_dword s8, s[4:5], 0x50
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x74
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s5, s7, 31
-; VI-NEXT: s_add_u32 s4, s5, s8
-; VI-NEXT: s_addc_u32 s5, s5, s9
+; VI-NEXT: s_ashr_i32 s5, s8, 31
+; VI-NEXT: s_add_u32 s4, s5, s6
+; VI-NEXT: s_addc_u32 s5, s5, s7
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index 76f60f1e5dbf..08f46b458621 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -374,7 +374,7 @@ define amdgpu_kernel void @sgpr_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %a) {
define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i64 %x) {
; SI-LABEL: s_trunc_i64_to_i1:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
+; SI-NEXT: s_load_dword s6, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
@@ -387,14 +387,14 @@ define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i
;
; VI-LABEL: s_trunc_i64_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
-; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT: s_load_dword s2, s[4:5], 0x4c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s0, 0
-; VI-NEXT: s_cselect_b32 s0, 63, -12
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_bitcmp1_b32 s2, 0
+; VI-NEXT: s_cselect_b32 s2, 63, -12
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
index 8af4a8de7b26..06765dcffe4b 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
@@ -71,12 +71,12 @@ define <3 x i32> @test_add3x32(ptr %a_ptr, ptr %b_ptr) {
; CHECK-LABEL: test_add3x32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_load_dword v4, v[2:3]
-; CHECK-NEXT: flat_load_dword v5, v[0:1]
+; CHECK-NEXT: flat_load_dword v4, v[0:1]
+; CHECK-NEXT: flat_load_dword v5, v[2:3]
; CHECK-NEXT: v_mov_b32_e32 v1, 48
; CHECK-NEXT: v_mov_b32_e32 v2, 48
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_or_b32_e32 v0, v5, v4
+; CHECK-NEXT: v_or_b32_e32 v0, v4, v5
; CHECK-NEXT: s_setpc_b64 s[30:31]
%a = load <3 x i32>, ptr %a_ptr, !range !2, !noundef !{}
%b = load <3 x i32>, ptr %b_ptr, !range !3, !noundef !{}
diff --git a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
index 504554037c53..ec2aa86a9505 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
@@ -1847,25 +1847,25 @@ define <2 x float> @shuffle_v2f32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v2f32_rebroadcast:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v2f32_rebroadcast:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v2f32_rebroadcast:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <2 x float>, ptr addrspace(1) %arg0
@@ -1877,28 +1877,28 @@ define <3 x float> @shuffle_v3f32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v3f32_rebroadcast:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v3f32_rebroadcast:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v3f32_rebroadcast:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <3 x float>, ptr addrspace(1) %arg0
@@ -1910,31 +1910,31 @@ define <4 x float> @shuffle_v4f32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v4f32_rebroadcast:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f32_rebroadcast:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f32_rebroadcast:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <4 x float>, ptr addrspace(1) %arg0
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 6bf6d540299f..1faf7763699c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -33,44 +33,33 @@ define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %
}
define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GX900-LABEL: shuffle_v4f16_234u:
-; GX900: ; %bb.0:
-; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; GX900-NEXT: s_waitcnt vmcnt(1)
-; GX900-NEXT: v_mov_b32_e32 v0, v6
-; GX900-NEXT: s_waitcnt vmcnt(0)
-; GX900-NEXT: v_mov_b32_e32 v1, v4
-; GX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: shuffle_v4f16_234u:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v4, v[0:1], off offset:4
-; GFX942-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
-; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: shuffle_v4f16_234u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_234u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_234u:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, ptr addrspace(1) %arg0
@@ -352,36 +341,33 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GX900-LABEL: shuffle_v4f16_357u:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GX900-NEXT: s_mov_b32 s4, 0x7060302
-; GX900-NEXT: s_waitcnt vmcnt(1)
-; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: shuffle_v4f16_357u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX942-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX942-NEXT: s_mov_b32 s0, 0x7060302
-; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_alignbit_b32 v1, s0, v5, 16
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_perm_b32 v0, v4, v6, s0
+; GFX942-NEXT: v_alignbit_b32 v1, s0, v5, 16
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_357u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: shuffle_v4f16_357u:
@@ -397,12 +383,11 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_357u:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
-; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, ptr addrspace(1) %arg0
%val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1082,23 +1067,21 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX9-LABEL: shuffle_v4f16_3456:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_alignbit_b32 v0, v4, v6, 16
+; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_3456:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: shuffle_v4f16_3456:
@@ -1117,12 +1100,11 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_3456:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, ptr addrspace(1) %arg0
%val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1134,12 +1116,11 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX9-LABEL: shuffle_v4f16_5634:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_5634:
@@ -1348,7 +1329,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GX900-LABEL: shuffle_v4f16_0000:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GX900-NEXT: global_load_dword v0, v[0:1], off
; GX900-NEXT: s_mov_b32 s4, 0x5040100
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: v_perm_b32 v0, v0, v0, s4
@@ -1358,7 +1339,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX942-LABEL: shuffle_v4f16_0000:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942-NEXT: global_load_dword v0, v[0:1], off
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_perm_b32 v0, v0, v0, s0
@@ -1368,7 +1349,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX10-LABEL: shuffle_v4f16_0000:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
; GFX10-NEXT: v_mov_b32_e32 v1, v0
@@ -1377,7 +1358,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_0000:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1387,7 +1368,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_0000:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2107,44 +2088,40 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GX900-LABEL: shuffle_v4f16_0456:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GX900-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT: global_load_dword v6, v[0:1], off
+; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GX900-NEXT: s_mov_b32 s4, 0x5040100
-; GX900-NEXT: ; kill: killed $vgpr0 killed $vgpr1
-; GX900-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GX900-NEXT: s_waitcnt vmcnt(0)
-; GX900-NEXT: v_perm_b32 v0, v5, v4, s4
-; GX900-NEXT: v_alignbit_b32 v1, v6, v5, 16
+; GX900-NEXT: v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT: v_alignbit_b32 v1, v5, v4, 16
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: shuffle_v4f16_0456:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX942-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
+; GFX942-NEXT: global_load_dword v6, v[0:1], off
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_perm_b32 v0, v6, v4, s0
-; GFX942-NEXT: v_alignbit_b32 v1, v7, v6, 16
+; GFX942-NEXT: v_perm_b32 v0, v4, v6, s0
+; GFX942-NEXT: v_alignbit_b32 v1, v5, v4, 16
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_0456:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
-; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
-; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
-; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16
+; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x5040100
+; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: shuffle_v4f16_0456:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
-; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
@@ -2154,7 +2131,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-FAKE16-LABEL: shuffle_v4f16_0456:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
@@ -3323,44 +3300,33 @@ define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1
}
define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GX900-LABEL: shuffle_v4bf16_234u:
-; GX900: ; %bb.0:
-; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; GX900-NEXT: s_waitcnt vmcnt(1)
-; GX900-NEXT: v_mov_b32_e32 v0, v6
-; GX900-NEXT: s_waitcnt vmcnt(0)
-; GX900-NEXT: v_mov_b32_e32 v1, v4
-; GX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: shuffle_v4bf16_234u:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v4, v[0:1], off offset:4
-; GFX942-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
-; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: shuffle_v4bf16_234u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_234u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_234u:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3642,36 +3608,33 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
; GX900-LABEL: shuffle_v4bf16_357u:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GX900-NEXT: s_mov_b32 s4, 0x7060302
-; GX900-NEXT: s_waitcnt vmcnt(1)
-; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: shuffle_v4bf16_357u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX942-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX942-NEXT: s_mov_b32 s0, 0x7060302
-; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_alignbit_b32 v1, s0, v5, 16
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_perm_b32 v0, v4, v6, s0
+; GFX942-NEXT: v_alignbit_b32 v1, s0, v5, 16
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_357u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: shuffle_v4bf16_357u:
@@ -3687,12 +3650,11 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_357u:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
-; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
%val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4372,23 +4334,21 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX9-LABEL: shuffle_v4bf16_3456:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_alignbit_b32 v0, v4, v6, 16
+; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_3456:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: shuffle_v4bf16_3456:
@@ -4407,12 +4367,11 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_3456:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
%val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4424,12 +4383,11 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX9-LABEL: shuffle_v4bf16_5634:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_5634:
@@ -4542,7 +4500,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
; GX900-LABEL: shuffle_v4bf16_0000:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GX900-NEXT: global_load_dword v0, v[0:1], off
; GX900-NEXT: s_mov_b32 s4, 0x5040100
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: v_perm_b32 v0, v0, v0, s4
@@ -4552,7 +4510,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX942-LABEL: shuffle_v4bf16_0000:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942-NEXT: global_load_dword v0, v[0:1], off
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_perm_b32 v0, v0, v0, s0
@@ -4562,7 +4520,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX10-LABEL: shuffle_v4bf16_0000:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
; GFX10-NEXT: v_mov_b32_e32 v1, v0
@@ -4571,7 +4529,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_0000:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4581,7 +4539,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_0000:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5665,44 +5623,40 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1
; GX900-LABEL: shuffle_v4bf16_0456:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GX900-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT: global_load_dword v6, v[0:1], off
+; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GX900-NEXT: s_mov_b32 s4, 0x5040100
-; GX900-NEXT: ; kill: killed $vgpr0 killed $vgpr1
-; GX900-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GX900-NEXT: s_waitcnt vmcnt(0)
-; GX900-NEXT: v_perm_b32 v0, v5, v4, s4
-; GX900-NEXT: v_alignbit_b32 v1, v6, v5, 16
+; GX900-NEXT: v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT: v_alignbit_b32 v1, v5, v4, 16
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: shuffle_v4bf16_0456:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX942-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
+; GFX942-NEXT: global_load_dword v6, v[0:1], off
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX942-NEXT: s_mov_b32 s0, 0x5040100
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_perm_b32 v0, v6, v4, s0
-; GFX942-NEXT: v_alignbit_b32 v1, v7, v6, 16
+; GFX942-NEXT: v_perm_b32 v0, v4, v6, s0
+; GFX942-NEXT: v_alignbit_b32 v1, v5, v4, 16
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_0456:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
-; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
-; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
-; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16
+; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x5040100
+; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: shuffle_v4bf16_0456:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
-; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
@@ -5712,7 +5666,7 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-FAKE16-LABEL: shuffle_v4bf16_0456:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100