summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll200
1 files changed, 82 insertions, 118 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index fe7d421d27f8..4598bcc04a50 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -910,9 +910,8 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
; GFX8-NEXT: s_lshl_b32 s1, s1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, s1, v2
@@ -930,11 +929,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
; GFX7-NEXT: s_lshl_b32 s1, s1, s0
; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, s1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -1089,9 +1087,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v1, 0xff
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -1106,9 +1103,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_and_b32 s1, s4, 0xff
; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -1180,9 +1176,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1197,9 +1192,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -1269,12 +1263,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: s_and_b32 s0, s2, 0xff
-; GFX8-NEXT: v_not_b32_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_bfi_b32 v2, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1292,11 +1285,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -1363,10 +1355,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -1384,11 +1375,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -1455,10 +1445,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_mov_b32_e32 v1, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
+; GFX8-NEXT: v_bfi_b32 v3, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -1476,11 +1465,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -1683,19 +1671,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-LABEL: insertelement_v_v8i8_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_and_b32 s1, s3, 3
; GFX8-NEXT: s_lshr_b32 s0, s3, 2
+; GFX8-NEXT: s_and_b32 s1, s3, 3
; GFX8-NEXT: s_and_b32 s2, s2, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s1, 3
+; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX8-NEXT: s_lshl_b32 s2, s2, s1
; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, s1, v4
+; GFX8-NEXT: v_bfi_b32 v4, s1, 0, v4
; GFX8-NEXT: v_or_b32_e32 v4, s2, v4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
@@ -1709,19 +1696,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s3, 3
; GFX7-NEXT: s_lshr_b32 s0, s3, 2
+; GFX7-NEXT: s_and_b32 s1, s3, 3
; GFX7-NEXT: s_and_b32 s2, s2, 0xff
; GFX7-NEXT: s_lshl_b32 s1, s1, 3
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: s_lshl_b32 s2, s2, s1
; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, s1, v2
+; GFX7-NEXT: v_bfi_b32 v2, s1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v2, s2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1953,8 +1939,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v4, v0, v3
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -1980,8 +1965,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -2091,8 +2075,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -2109,17 +2092,16 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1
; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s0
; GFX7-NEXT: v_mov_b32_e32 v4, s1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -2219,16 +2201,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_and_b32 s0, s2, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT: v_not_b32_e32 v2, v2
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v2, v7
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2245,16 +2226,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
; GFX7-NEXT: s_and_b32 s0, s2, 0xff
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT: v_not_b32_e32 v2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2342,15 +2322,14 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: s_lshr_b32 s0, s2, 2
; GFX8-NEXT: s_lshl_b32 s1, s1, 3
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, s1, v5
+; GFX8-NEXT: v_bfi_b32 v5, s1, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -2364,19 +2343,18 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s2, 3
; GFX7-NEXT: s_lshr_b32 s0, s2, 2
+; GFX7-NEXT: s_and_b32 s1, s2, 3
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: s_lshl_b32 s1, s1, 3
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2
; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, s1, v3
+; GFX7-NEXT: v_bfi_b32 v3, s1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -2464,16 +2442,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
; GFX8-NEXT: v_mov_b32_e32 v6, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX8-NEXT: v_not_b32_e32 v3, v3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, v6, v3
+; GFX8-NEXT: v_bfi_b32 v3, v3, 0, v6
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2490,16 +2467,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_bfi_b32 v3, v3, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2773,14 +2749,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-LABEL: insertelement_v_v16i8_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s3, 3
; GFX8-NEXT: s_lshr_b32 s4, s3, 2
+; GFX8-NEXT: s_and_b32 s0, s3, 3
; GFX8-NEXT: s_and_b32 s1, s2, 0xff
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
-; GFX8-NEXT: s_lshl_b32 s5, s1, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s6, s0
+; GFX8-NEXT: s_lshl_b32 s5, s1, s0
+; GFX8-NEXT: s_lshl_b32 s6, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
@@ -2789,7 +2764,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v6, s6, v6
+; GFX8-NEXT: v_bfi_b32 v6, s6, 0, v6
; GFX8-NEXT: v_or_b32_e32 v6, s5, v6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
@@ -2805,14 +2780,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s3, 3
; GFX7-NEXT: s_lshr_b32 s4, s3, 2
+; GFX7-NEXT: s_and_b32 s0, s3, 3
; GFX7-NEXT: s_and_b32 s1, s2, 0xff
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
-; GFX7-NEXT: s_lshl_b32 s5, s1, s0
-; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s6, s0
+; GFX7-NEXT: s_lshl_b32 s5, s1, s0
+; GFX7-NEXT: s_lshl_b32 s6, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -2821,7 +2795,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT: v_bfi_b32 v4, s6, 0, v4
; GFX7-NEXT: v_or_b32_e32 v4, s5, v4
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
@@ -3126,17 +3100,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_and_b32 s4, s4, 0xff
; GFX8-NEXT: v_mov_b32_e32 v3, 0xff
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v6, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
@@ -3157,23 +3130,22 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT: s_and_b32 s4, s4, 0xff
; GFX7-NEXT: v_mov_b32_e32 v5, s11
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT: s_and_b32 s4, s4, 0xff
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v5, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s9
@@ -3304,23 +3276,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT: v_and_b32_e32 v1, 3, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT: v_and_b32_e32 v1, 3, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xff
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xff
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX8-NEXT: v_or_b32_e32 v6, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3341,23 +3312,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v5, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
@@ -3491,7 +3461,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v7, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v8, 0
@@ -3499,7 +3468,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2
; GFX8-NEXT: v_or_b32_e32 v9, v0, v9
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
@@ -3521,9 +3490,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
+; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -3531,7 +3499,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v7, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v7
; GFX7-NEXT: v_or_b32_e32 v7, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
@@ -3636,13 +3604,12 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-LABEL: insertelement_v_v16i8_v_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s2, 3
; GFX8-NEXT: s_lshr_b32 s4, s2, 2
+; GFX8-NEXT: s_and_b32 s0, s2, 3
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s5, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_lshl_b32 s5, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -3652,7 +3619,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX8-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
@@ -3668,14 +3635,13 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s2, 3
; GFX7-NEXT: s_lshr_b32 s4, s2, 2
+; GFX7-NEXT: s_and_b32 s0, s2, 3
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
-; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s5, s0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_lshl_b32 s5, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -3684,7 +3650,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX7-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX7-NEXT: v_or_b32_e32 v7, v1, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
@@ -3798,7 +3764,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v9, 0
@@ -3806,7 +3771,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3
; GFX8-NEXT: v_or_b32_e32 v3, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
@@ -3822,15 +3787,14 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 2, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 3, v3
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -3838,7 +3802,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc