summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll166
1 files changed, 82 insertions, 84 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index fe7def8a6927..b01e92d6979a 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -331,8 +331,7 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_35u5:
@@ -390,12 +389,9 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_357u:
@@ -1225,13 +1221,15 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_5734:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
-; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_5734:
@@ -1482,10 +1480,11 @@ define <4 x half> @shuffle_v4f16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_1100:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_1100:
@@ -1538,13 +1537,11 @@ define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: shuffle_v4f16_6161:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off offset:4
-; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1597,8 +1594,7 @@ define <4 x half> @shuffle_v4f16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_2333:
@@ -1647,8 +1643,7 @@ define <4 x half> @shuffle_v4f16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4f16_6667:
@@ -2190,13 +2185,13 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in,
; GFX942-LABEL: shuffle_scalar_load_v8i32_0123:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: shuffle_scalar_load_v8i32_0123:
@@ -2318,13 +2313,10 @@ define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX11-TRUE16-LABEL: hi16bits_v2f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: hi16bits_v2f16:
@@ -2373,14 +2365,23 @@ define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: low16hi16bits_v2f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: global_load_b32 v1, v[2:3], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: low16hi16bits_v2f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: low16hi16bits_v2f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = load <2 x half>, ptr addrspace(1) %x0, align 4
%1 = load <2 x half>, ptr addrspace(1) %x1, align 4
@@ -2520,14 +2521,23 @@ define <2 x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: i16_low16hi16bits:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: global_load_b32 v1, v[2:3], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: i16_low16hi16bits:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: i16_low16hi16bits:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
%1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
@@ -3617,8 +3627,7 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_35u5:
@@ -3676,12 +3685,9 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_357u:
@@ -4511,13 +4517,15 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_5734:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off
-; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_5734:
@@ -4671,10 +4679,11 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_1100:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_1100:
@@ -4727,13 +4736,11 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-LABEL: shuffle_v4bf16_6161:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off offset:4
-; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4786,8 +4793,7 @@ define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_2333:
@@ -4836,8 +4842,7 @@ define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shuffle_v4bf16_6667:
@@ -5533,13 +5538,9 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h
; GFX11-TRUE16-NEXT: global_store_b64 v6, v[0:1], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -5817,13 +5818,10 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
; GFX11-TRUE16-LABEL: hi16bits_v2bf16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: hi16bits_v2bf16:
@@ -5875,10 +5873,10 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1)
; GFX11-TRUE16-LABEL: low16hi16bits_v2bf16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: low16hi16bits_v2bf16: