diff options
| author | Mingming Liu <mingmingl@google.com> | 2025-09-10 15:25:31 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-09-10 15:25:31 -0700 |
| commit | 1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch) | |
| tree | 57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll | |
| parent | 898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff) | |
| parent | b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff) | |
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-format
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll | 166 |
1 files changed, 82 insertions, 84 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index fe7def8a6927..b01e92d6979a 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -331,8 +331,7 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4f16_35u5: @@ -390,12 +389,9 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4f16_357u: @@ -1225,13 +1221,15 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_5734: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off -; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4f16_5734: @@ -1482,10 +1480,11 @@ define <4 x half> @shuffle_v4f16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_1100: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v[0:1], off +; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4f16_1100: @@ -1538,13 +1537,11 @@ define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_6161: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off offset:4 -; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 
; @@ -1597,8 +1594,7 @@ define <4 x half> @shuffle_v4f16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4f16_2333: @@ -1647,8 +1643,7 @@ define <4 x half> @shuffle_v4f16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4f16_6667: @@ -2190,13 +2185,13 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; GFX942-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3] ; GFX942-NEXT: s_endpgm ; ; GFX10-LABEL: shuffle_scalar_load_v8i32_0123: @@ -2318,13 +2313,10 @@ define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-TRUE16-LABEL: hi16bits_v2f16: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: hi16bits_v2f16: @@ -2373,14 +2365,23 @@ define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: low16hi16bits_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: global_load_b32 v1, v[2:3], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: low16hi16bits_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: low16hi16bits_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = load <2 x half>, ptr addrspace(1) %x0, align 4 %1 = load <2 x half>, ptr addrspace(1) %x1, align 4 @@ -2520,14 +2521,23 @@ define <2 
x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: i16_low16hi16bits: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: global_load_b32 v1, v[2:3], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: i16_low16hi16bits: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: i16_low16hi16bits: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-FAKE16-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4 %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4 @@ -3617,8 +3627,7 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4bf16_35u5: @@ -3676,12 +3685,9 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4bf16_357u: @@ -4511,13 +4517,15 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_5734: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v[2:3], off -; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4bf16_5734: @@ -4671,10 +4679,11 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_1100: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v[0:1], off +; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4bf16_1100: @@ -4727,13 +4736,11 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_6161: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off offset:4 -; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4786,8 +4793,7 @@ define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4bf16_2333: @@ -4836,8 +4842,7 @@ define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.h ; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4bf16_6667: @@ -5533,13 +5538,9 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h ; GFX11-TRUE16-NEXT: global_store_b64 v6, v[0:1], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -5817,13 +5818,10 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) ; GFX11-TRUE16-LABEL: hi16bits_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-TRUE16-NEXT: global_load_b32 v0, v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: hi16bits_v2bf16: @@ -5875,10 +5873,10 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) ; GFX11-TRUE16-LABEL: low16hi16bits_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: low16hi16bits_v2bf16: |
