diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll | 200 |
1 files changed, 82 insertions, 118 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index fe7d421d27f8..4598bcc04a50 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -910,9 +910,8 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: s_lshl_b32 s1, s1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX8-NEXT: v_bfi_b32 v2, s0, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, s1, v2 @@ -930,11 +929,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 ; GFX7-NEXT: s_lshl_b32 s1, s1, s0 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -1089,9 +1087,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_bfi_b32 v3, v0, 0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1106,9 +1103,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_and_b32 s1, s4, 0xff ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1180,9 +1176,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1197,9 +1192,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, s0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1269,12 +1263,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_and_b32 s0, s2, 0xff -; GFX8-NEXT: v_not_b32_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, v0, v1 +; GFX8-NEXT: v_bfi_b32 v2, v1, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1292,11 +1285,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -1363,10 +1355,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_bfi_b32 v3, s0, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1384,11 +1375,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -1455,10 +1445,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_not_b32_e32 v1, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_bfi_b32 v3, v1, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1476,11 +1465,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1683,19 +1671,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-LABEL: insertelement_v_v8i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 3 ; GFX8-NEXT: s_lshr_b32 s0, s3, 2 +; GFX8-NEXT: s_and_b32 s1, s3, 3 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 +; GFX8-NEXT: v_bfi_b32 v4, s1, 0, v4 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] @@ -1709,19 +1696,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s3, 3 ; GFX7-NEXT: s_lshr_b32 s0, s3, 2 +; GFX7-NEXT: s_and_b32 s1, s3, 3 ; GFX7-NEXT: s_and_b32 s2, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_lshl_b32 s2, s2, s1 ; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: v_bfi_b32 v2, s1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -1953,8 +1939,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1980,8 +1965,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2091,8 +2075,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2109,17 +2092,16 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2219,16 +2201,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_bfi_b32 v2, v2, 0, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2245,16 +2226,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_not_b32_e32 v2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_bfi_b32 v2, v2, 0, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2342,15 +2322,14 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: s_lshr_b32 s0, s2, 2 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 +; GFX8-NEXT: v_bfi_b32 v5, s1, 0, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -2364,19 +2343,18 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s2, 3 ; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; GFX7-NEXT: s_and_b32 s1, s2, 3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 ; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 +; GFX7-NEXT: v_bfi_b32 v3, s1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -2464,16 +2442,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_mov_b32_e32 v6, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX8-NEXT: v_not_b32_e32 v3, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, v6, v3 +; GFX8-NEXT: v_bfi_b32 v3, v3, 0, v6 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2490,16 +2467,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_bfi_b32 v3, v3, 0, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2773,14 +2749,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-LABEL: insertelement_v_v16i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s3, 3 ; GFX8-NEXT: s_lshr_b32 s4, s3, 2 +; GFX8-NEXT: s_and_b32 s0, s3, 3 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: s_lshl_b32 s5, s1, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s6, s0 +; GFX8-NEXT: s_lshl_b32 s5, s1, s0 +; GFX8-NEXT: s_lshl_b32 s6, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 @@ -2789,7 +2764,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_bfi_b32 v6, s6, 0, v6 ; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] @@ -2805,14 +2780,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s3, 3 ; GFX7-NEXT: s_lshr_b32 s4, s3, 2 +; GFX7-NEXT: s_and_b32 s0, s3, 3 ; GFX7-NEXT: s_and_b32 s1, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: s_lshl_b32 s5, s1, s0 -; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s6, s0 +; GFX7-NEXT: s_lshl_b32 s5, s1, s0 +; GFX7-NEXT: s_lshl_b32 s6, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2821,7 +2795,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_bfi_b32 v4, s6, 0, v4 ; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] @@ -3126,17 +3100,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: v_mov_b32_e32 v3, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 @@ -3157,23 +3130,22 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX7-NEXT: v_not_b32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 @@ -3304,23 +3276,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0xff ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 0xff +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_not_b32_e32 v1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3341,23 +3312,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX7-NEXT: v_not_b32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2 ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -3491,7 +3461,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 @@ -3499,7 +3468,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2 ; GFX8-NEXT: v_or_b32_e32 v9, v0, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc @@ -3521,9 +3490,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -3531,7 +3499,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc @@ -3636,13 +3604,12 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-LABEL: insertelement_v_v16i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: s_lshr_b32 s4, s2, 2 +; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s5, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshl_b32 s5, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3652,7 +3619,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX8-NEXT: v_bfi_b32 v1, s5, 0, v1 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] @@ -3668,14 +3635,13 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s2, 3 ; GFX7-NEXT: s_lshr_b32 s4, s2, 2 +; GFX7-NEXT: s_and_b32 s0, s2, 3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s5, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -3684,7 +3650,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7-NEXT: v_bfi_b32 v1, s5, 0, v1 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] @@ -3798,7 +3764,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 @@ -3806,7 +3771,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc @@ -3822,15 +3787,14 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 2, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -3838,7 +3802,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc |
