diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 39 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll | 38 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll | 2 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll | 21 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll | 11 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll | 24 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/mul_int24.ll | 49 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll | 8 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/sra.ll | 60 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/trunc.ll | 16 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll | 6 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll | 54 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll | 258 |
13 files changed, 269 insertions, 317 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 0a2e758f7cf2..50cc28810000 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -879,44 +879,43 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: s_load_dword s4, s[4:5], 0xe +; SI-NEXT: s_brev_b32 s5, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_bfi_b32 v0, s5, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dword s3, s[4:5], 0x38 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s0, -2 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_bfi_b32 v2, s0, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_bfi_b32 v2, s4, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x38 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s3, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %sign.trunc = fptrunc double %sign to float %result = call float @llvm.copysign.f32(float %mag, float %sign.trunc) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index 8b5c34d97e50..674924e3a925 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -13,14 +13,14 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: s_load_dword s4, s[4:5], 0x1e +; SI-NEXT: s_brev_b32 s5, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_bfi_b32 v1, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_bfi_b32 v1, s5, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -28,32 +28,32 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], ; VI-LABEL: s_test_copysign_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x74 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dword s6, s[4:5], 0x78 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_bfi_b32 v1, s4, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x74 -; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x4c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x78 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double %sign) store double %result, ptr addrspace(1) %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll index 5df61f19033a..d794dcf06f99 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll @@ -8,7 +8,7 @@ define amdgpu_gs i32 @vgpr16_copyto_sgpr() { ; CHECK-LABEL: vgpr16_copyto_sgpr: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: v_mov_b32_e32 v0, lds@abs32@lo -; CHECK-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; CHECK-NEXT: ds_load_b32 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll index 6e82a294243d..b1becd0409a3 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll +++ b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll @@ -19,31 +19,30 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; FORWARDXNACK-LABEL: shuffle_v4f16_234u: ; FORWARDXNACK: ; %bb.0: ; FORWARDXNACK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FORWARDXNACK-NEXT: global_load_dword v6, v[0:1], off offset:4 -; FORWARDXNACK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; FORWARDXNACK-NEXT: global_load_dword v4, v[0:1], off offset:4 +; FORWARDXNACK-NEXT: global_load_dword v5, v[2:3], off ; FORWARDXNACK-NEXT: s_waitcnt vmcnt(1) -; FORWARDXNACK-NEXT: v_mov_b32_e32 v0, v6 +; FORWARDXNACK-NEXT: v_mov_b32_e32 v0, v4 ; FORWARDXNACK-NEXT: s_waitcnt vmcnt(0) -; FORWARDXNACK-NEXT: v_mov_b32_e32 v1, v4 +; FORWARDXNACK-NEXT: v_mov_b32_e32 v1, v5 ; FORWARDXNACK-NEXT: s_setpc_b64 s[30:31] ; ; REVERSEXNACK-LABEL: shuffle_v4f16_234u: ; REVERSEXNACK: ; %bb.0: ; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v1 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v0 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v3 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v2 -; REVERSEXNACK-NEXT: global_load_dword v0, v[5:6], off offset:4 -; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[3:4], off +; REVERSEXNACK-NEXT: global_load_dword v5, v[0:1], off offset:4 +; REVERSEXNACK-NEXT: global_load_dword v4, v[2:3], off +; REVERSEXNACK-NEXT: s_waitcnt vmcnt(1) +; REVERSEXNACK-NEXT: v_mov_b32_e32 v0, v5 ; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0) +; REVERSEXNACK-NEXT: v_mov_b32_e32 v1, v4 ; REVERSEXNACK-NEXT: s_setpc_b64 s[30:31] ; ; NOXNACK-LABEL: shuffle_v4f16_234u: ; NOXNACK: ; %bb.0: ; NOXNACK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOXNACK-NEXT: global_load_dword v0, v[0:1], off offset:4 -; NOXNACK-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; NOXNACK-NEXT: global_load_dword v1, v[2:3], off ; NOXNACK-NEXT: s_waitcnt vmcnt(0) ; NOXNACK-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, ptr addrspace(1) %arg0 diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 76f204dd0c16..8ad8a5405e11 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -29,21 +29,20 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v6, s67, 17 ; CHECK-NEXT: v_writelane_b32 v6, s68, 18 ; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: v_writelane_b32 v6, s69, 19 -; CHECK-NEXT: s_mov_b32 s68, 0 ; CHECK-NEXT: s_mov_b32 s69, s4 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b32 s68, 0 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x0 ; CHECK-NEXT: s_load_dwordx8 s[24:31], s[68:69], 0x30 ; CHECK-NEXT: s_load_dwordx16 s[52:67], s[68:69], 0xf0 -; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9 -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: ; kill: killed $sgpr4_sgpr5 ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130 ; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v6, s70, 20 ; CHECK-NEXT: v_writelane_b32 v6, s71, 21 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_writelane_b32 v7, s8, 0 ; CHECK-NEXT: v_writelane_b32 v7, s9, 1 diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index 2daed9b69384..88d60c5fac44 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -112,10 +112,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 -; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V4-NEXT: s_load_dword s0, s[8:9], 0x4 +; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x40 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V4-NEXT: s_cmp_eq_u32 s0, s1 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8V4-NEXT: flat_store_dword v[0:1], v0 @@ -124,10 +124,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; ; GFX8V5-LABEL: llvm_amdgcn_is_shared: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc -; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0x4 +; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0xcc ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V5-NEXT: s_cmp_eq_u32 s0, s1 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8V5-NEXT: flat_store_dword v[0:1], v0 @@ -166,10 +166,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 -; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V4-NEXT: s_load_dword s0, s[8:9], 0x4 +; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x44 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V4-NEXT: s_cmp_eq_u32 s0, s1 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8V4-NEXT: flat_store_dword v[0:1], v0 @@ -178,10 +178,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; ; GFX8V5-LABEL: llvm_amdgcn_is_private: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8 -; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0x4 +; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V5-NEXT: s_cmp_eq_u32 s0, s1 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8V5-NEXT: flat_store_dword v[0:1], v0 diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 10d4eb029ee3..36dabd858c70 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -459,18 +459,18 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 { ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s5, s6, 0x180000 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_mul_i32 s4, s5, s4 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_mul_i32 s5, s4, s5 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -478,11 +478,12 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dword s3, s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i32 s2, s2, 0x180000 -; VI-NEXT: s_bfe_i32 s3, s4, 0x180000 +; VI-NEXT: s_bfe_i32 s3, s3, 0x180000 ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0 ; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0 @@ -569,28 +570,28 @@ entry: define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-NEXT: s_load_dword s7, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b64 s[6:7], s[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x34 +; VI-NEXT: s_load_dword s7, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s7, v0 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll index 8f3acece55ce..8bf14a013149 100644 --- a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll +++ b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll @@ -16,7 +16,7 @@ define amdgpu_ps float @nonuniform_uniform(i32 %arg18) { .entry: %tmp31 = sext i32 %arg18 to i64 %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) @indexable, i64 0, i64 %tmp31 - %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16 + %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16 %tmp34 = extractelement <3 x float> %tmp33, i32 0 ret float %tmp34 } @@ -31,7 +31,7 @@ define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) { %tmp1 = zext i32 %arg18 to i64 %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1) %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset - %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16 + %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16 %tmp34 = extractelement <3 x float> %tmp33, i32 0 ret float %tmp34 } @@ -46,7 +46,7 @@ define amdgpu_ps float @const_nonuniform(i32 %arg18) { %tmp1 = zext i32 %arg18 to i64 %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1) %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 1 - %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16 + %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16 %tmp34 = extractelement <3 x float> %tmp33, i32 0 ret float %tmp34 } @@ -61,7 +61,7 @@ define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) { %tmp1 = zext i32 %arg18 to i64 %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1) %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset - %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16 + %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16 %tmp34 = extractelement <3 x float> %tmp33, i32 0 ret float %tmp34 } diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 80c0d0f45eb9..508bd78785b6 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -830,16 +830,16 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_ashr_33_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_33_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dword s6, s[4:5], 0x14 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s6, s7, 31 -; SI-NEXT: s_ashr_i32 s7, s7, 1 -; SI-NEXT: s_add_u32 s4, s7, s4 -; SI-NEXT: s_addc_u32 s5, s6, s5 +; SI-NEXT: s_ashr_i32 s7, s6, 31 +; SI-NEXT: s_ashr_i32 s6, s6, 1 +; SI-NEXT: s_add_u32 s4, s6, s4 +; SI-NEXT: s_addc_u32 s5, s7, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -847,16 +847,16 @@ define amdgpu_kernel void @s_ashr_33_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_33_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; VI-NEXT: s_load_dword s6, s[4:5], 0x50 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s6, s7, 31 -; VI-NEXT: s_ashr_i32 s7, s7, 1 -; VI-NEXT: s_add_u32 s4, s7, s4 -; VI-NEXT: s_addc_u32 s5, s6, s5 +; VI-NEXT: s_ashr_i32 s7, s6, 31 +; VI-NEXT: s_ashr_i32 s6, s6, 1 +; VI-NEXT: s_add_u32 s4, s6, s4 +; VI-NEXT: s_addc_u32 s5, s7, s5 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -953,16 +953,16 @@ define amdgpu_kernel void @v_ashr_33_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_ashr_62_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_62_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dword s6, s[4:5], 0x14 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s6, s7, 31 -; SI-NEXT: s_ashr_i32 s7, s7, 30 -; SI-NEXT: s_add_u32 s4, s7, s4 -; SI-NEXT: s_addc_u32 s5, s6, s5 +; SI-NEXT: s_ashr_i32 s7, s6, 31 +; SI-NEXT: s_ashr_i32 s6, s6, 30 +; SI-NEXT: s_add_u32 s4, s6, s4 +; SI-NEXT: s_addc_u32 s5, s7, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -970,16 +970,16 @@ define amdgpu_kernel void @s_ashr_62_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_62_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; VI-NEXT: s_load_dword s6, s[4:5], 0x50 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s6, s7, 31 -; VI-NEXT: s_ashr_i32 s7, s7, 30 -; VI-NEXT: s_add_u32 s4, s7, s4 -; VI-NEXT: s_addc_u32 s5, s6, s5 +; VI-NEXT: s_ashr_i32 s7, s6, 31 +; VI-NEXT: s_ashr_i32 s6, s6, 30 +; VI-NEXT: s_add_u32 s4, s6, s4 +; VI-NEXT: s_addc_u32 s5, s7, s5 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1077,15 +1077,15 @@ define amdgpu_kernel void @v_ashr_62_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_63_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x1d +; SI-NEXT: s_load_dword s8, s[4:5], 0x14 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x1d ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s5, s7, 31 -; SI-NEXT: s_add_u32 s4, s5, s8 -; SI-NEXT: s_addc_u32 s5, s5, s9 +; SI-NEXT: s_ashr_i32 s5, s8, 31 +; SI-NEXT: s_add_u32 s4, s5, s6 +; SI-NEXT: s_addc_u32 s5, s5, s7 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1093,15 +1093,15 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_63_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x74 +; VI-NEXT: s_load_dword s8, s[4:5], 0x50 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x74 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s5, s7, 31 -; VI-NEXT: s_add_u32 s4, s5, s8 -; VI-NEXT: s_addc_u32 s5, s5, s9 +; VI-NEXT: s_ashr_i32 s5, s8, 31 +; VI-NEXT: s_add_u32 s4, s5, s6 +; VI-NEXT: s_addc_u32 s5, s5, s7 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index 76f60f1e5dbf..08f46b458621 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -374,7 +374,7 @@ define amdgpu_kernel void @sgpr_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i64 %x) { ; SI-LABEL: s_trunc_i64_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -387,14 +387,14 @@ define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i ; ; VI-LABEL: s_trunc_i64_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s0, 0 -; VI-NEXT: s_cselect_b32 s0, 63, -12 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bitcmp1_b32 s2, 0 +; VI-NEXT: s_cselect_b32 s2, 63, -12 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll index 8af4a8de7b26..06765dcffe4b 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll @@ -71,12 +71,12 @@ define <3 x i32> @test_add3x32(ptr %a_ptr, ptr %b_ptr) { ; CHECK-LABEL: test_add3x32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dword v4, v[2:3] -; CHECK-NEXT: flat_load_dword v5, v[0:1] +; CHECK-NEXT: flat_load_dword v4, v[0:1] +; CHECK-NEXT: flat_load_dword v5, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v1, 48 ; CHECK-NEXT: v_mov_b32_e32 v2, 48 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v0, v5, v4 +; CHECK-NEXT: v_or_b32_e32 v0, v4, v5 ; CHECK-NEXT: s_setpc_b64 s[30:31] %a = load <3 x i32>, ptr %a_ptr, !range !2, !noundef !{} %b = load <3 x i32>, ptr %b_ptr, !range !3, !noundef !{} diff --git a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll index 504554037c53..ec2aa86a9505 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll @@ -1847,25 +1847,25 @@ define <2 x float> @shuffle_v2f32_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v2f32_rebroadcast: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v2f32_rebroadcast: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v2f32_rebroadcast: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <2 x float>, ptr addrspace(1) %arg0 @@ -1877,28 +1877,28 @@ define <3 x float> @shuffle_v3f32_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v3f32_rebroadcast: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v3f32_rebroadcast: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v3f32_rebroadcast: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <3 x float>, ptr addrspace(1) %arg0 @@ -1910,31 +1910,31 @@ define <4 x float> @shuffle_v4f32_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v4f32_rebroadcast: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f32_rebroadcast: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f32_rebroadcast: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <4 x float>, ptr addrspace(1) %arg0 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 6bf6d540299f..1faf7763699c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -33,44 +33,33 @@ define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) % } define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { -; GX900-LABEL: shuffle_v4f16_234u: -; GX900: ; %bb.0: -; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GX900-NEXT: s_waitcnt vmcnt(1) -; GX900-NEXT: v_mov_b32_e32 v0, v6 -; GX900-NEXT: s_waitcnt vmcnt(0) -; GX900-NEXT: v_mov_b32_e32 v1, v4 -; GX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: shuffle_v4f16_234u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX942-NEXT: global_load_dwordx2 v[6:7], v[2:3], off -; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: shuffle_v4f16_234u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_234u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_234u: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, ptr addrspace(1) %arg0 @@ -352,36 +341,33 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GX900-LABEL: shuffle_v4f16_357u: ; GX900: ; %bb.0: ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GX900-NEXT: s_mov_b32 s4, 0x7060302 -; GX900-NEXT: s_waitcnt vmcnt(1) -; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GX900-NEXT: s_waitcnt vmcnt(0) ; GX900-NEXT: v_perm_b32 v0, v4, v6, s4 +; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: shuffle_v4f16_357u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX942-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX942-NEXT: s_mov_b32 s0, 0x7060302 -; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_alignbit_b32 v1, s0, v5, 16 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_perm_b32 v0, v4, v6, s0 +; GFX942-NEXT: v_alignbit_b32 v1, s0, v5, 16 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_357u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: shuffle_v4f16_357u: @@ -397,12 +383,11 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_357u: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off -; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 +; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, ptr addrspace(1) %arg0 %val1 = load <4 x half>, ptr addrspace(1) %arg1 @@ -1082,23 +1067,21 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX9-LABEL: shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v0, v4, v6, 16 +; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_3456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: shuffle_v4f16_3456: @@ -1117,12 +1100,11 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_3456: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, ptr addrspace(1) %arg0 %val1 = load <4 x half>, ptr addrspace(1) %arg1 @@ -1134,12 +1116,11 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX9-LABEL: shuffle_v4f16_5634: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16 +; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_5634: @@ -1348,7 +1329,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GX900-LABEL: shuffle_v4f16_0000: ; GX900: ; %bb.0: ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GX900-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GX900-NEXT: global_load_dword v0, v[0:1], off ; GX900-NEXT: s_mov_b32 s4, 0x5040100 ; GX900-NEXT: s_waitcnt vmcnt(0) ; GX900-NEXT: v_perm_b32 v0, v0, v0, s4 @@ -1358,7 +1339,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX942-LABEL: shuffle_v4f16_0000: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX942-NEXT: global_load_dword v0, v[0:1], off ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_perm_b32 v0, v0, v0, s0 @@ -1368,7 +1349,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX10-LABEL: shuffle_v4f16_0000: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -1377,7 +1358,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_0000: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1387,7 +1368,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_0000: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2107,44 +2088,40 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GX900-LABEL: shuffle_v4f16_0456: ; GX900: ; %bb.0: ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GX900-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GX900-NEXT: global_load_dword v6, v[0:1], off +; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GX900-NEXT: s_mov_b32 s4, 0x5040100 -; GX900-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GX900-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GX900-NEXT: s_waitcnt vmcnt(0) -; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 -; GX900-NEXT: v_alignbit_b32 v1, v6, v5, 16 +; GX900-NEXT: v_perm_b32 v0, v4, v6, s4 +; GX900-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: shuffle_v4f16_0456: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX942-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX942-NEXT: global_load_dword v6, v[0:1], off +; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_perm_b32 v0, v6, v4, s0 -; GFX942-NEXT: v_alignbit_b32 v1, v7, v6, 16 +; GFX942-NEXT: v_perm_b32 v0, v4, v6, s0 +; GFX942-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_0456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off -; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16 +; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: shuffle_v4f16_0456: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h @@ -2154,7 +2131,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-FAKE16-LABEL: shuffle_v4f16_0456: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 @@ -3323,44 +3300,33 @@ define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1 } define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { -; GX900-LABEL: shuffle_v4bf16_234u: -; GX900: ; %bb.0: -; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GX900-NEXT: s_waitcnt vmcnt(1) -; GX900-NEXT: v_mov_b32_e32 v0, v6 -; GX900-NEXT: s_waitcnt vmcnt(0) -; GX900-NEXT: v_mov_b32_e32 v1, v4 -; GX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: shuffle_v4bf16_234u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX942-NEXT: global_load_dwordx2 v[6:7], v[2:3], off -; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: shuffle_v4bf16_234u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_234u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_234u: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 @@ -3642,36 +3608,33 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GX900-LABEL: shuffle_v4bf16_357u: ; GX900: ; %bb.0: ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GX900-NEXT: s_mov_b32 s4, 0x7060302 -; GX900-NEXT: s_waitcnt vmcnt(1) -; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GX900-NEXT: s_waitcnt vmcnt(0) ; GX900-NEXT: v_perm_b32 v0, v4, v6, s4 +; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: shuffle_v4bf16_357u: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX942-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX942-NEXT: s_mov_b32 s0, 0x7060302 -; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_alignbit_b32 v1, s0, v5, 16 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_perm_b32 v0, v4, v6, s0 +; GFX942-NEXT: v_alignbit_b32 v1, s0, v5, 16 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_357u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: shuffle_v4bf16_357u: @@ -3687,12 +3650,11 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_357u: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off -; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 +; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -4372,23 +4334,21 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v0, v4, v6, 16 +; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_3456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: shuffle_v4bf16_3456: @@ -4407,12 +4367,11 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_3456: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 @@ -4424,12 +4383,11 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX9-LABEL: shuffle_v4bf16_5634: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16 +; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_5634: @@ -4542,7 +4500,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1 ; GX900-LABEL: shuffle_v4bf16_0000: ; GX900: ; %bb.0: ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GX900-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GX900-NEXT: global_load_dword v0, v[0:1], off ; GX900-NEXT: s_mov_b32 s4, 0x5040100 ; GX900-NEXT: s_waitcnt vmcnt(0) ; GX900-NEXT: v_perm_b32 v0, v0, v0, s4 @@ -4552,7 +4510,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX942-LABEL: shuffle_v4bf16_0000: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX942-NEXT: global_load_dword v0, v[0:1], off ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_perm_b32 v0, v0, v0, s0 @@ -4562,7 +4520,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX10-LABEL: shuffle_v4bf16_0000: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -4571,7 +4529,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_0000: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4581,7 +4539,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_0000: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5665,44 +5623,40 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GX900-LABEL: shuffle_v4bf16_0456: ; GX900: ; %bb.0: ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GX900-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GX900-NEXT: global_load_dword v6, v[0:1], off +; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GX900-NEXT: s_mov_b32 s4, 0x5040100 -; GX900-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GX900-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GX900-NEXT: s_waitcnt vmcnt(0) -; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 -; GX900-NEXT: v_alignbit_b32 v1, v6, v5, 16 +; GX900-NEXT: v_perm_b32 v0, v4, v6, s4 +; GX900-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: shuffle_v4bf16_0456: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX942-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX942-NEXT: global_load_dword v6, v[0:1], off +; GFX942-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_perm_b32 v0, v6, v4, s0 -; GFX942-NEXT: v_alignbit_b32 v1, v7, v6, 16 +; GFX942-NEXT: v_perm_b32 v0, v4, v6, s0 +; GFX942-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_0456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off -; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16 +; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: shuffle_v4bf16_0456: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h @@ -5712,7 +5666,7 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-FAKE16-LABEL: shuffle_v4bf16_0456: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-FAKE16-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 |
