summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll282
1 files changed, 117 insertions, 165 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index cae833b0d64e..0e1bbbd1ea92 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -123,9 +123,8 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
; GFX8-NEXT: s_lshl_b32 s1, s1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, s1, v2
@@ -143,11 +142,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
; GFX7-NEXT: s_lshl_b32 s1, s1, s0
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, s1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -302,9 +300,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -319,9 +316,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_and_b32 s1, s4, 0xffff
; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -393,9 +389,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -410,9 +405,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -482,12 +476,11 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
-; GFX8-NEXT: v_not_b32_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_bfi_b32 v2, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -505,11 +498,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -576,10 +568,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -597,11 +588,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -668,10 +658,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
+; GFX8-NEXT: v_bfi_b32 v3, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -689,11 +678,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -820,19 +808,18 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-LABEL: insertelement_v_v4i16_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_and_b32 s1, s3, 1
; GFX8-NEXT: s_lshr_b32 s0, s3, 1
+; GFX8-NEXT: s_and_b32 s1, s3, 1
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_lshl_b32 s1, s1, 4
+; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX8-NEXT: s_lshl_b32 s2, s2, s1
; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, s1, v4
+; GFX8-NEXT: v_bfi_b32 v4, s1, 0, v4
; GFX8-NEXT: v_or_b32_e32 v4, s2, v4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
@@ -846,19 +833,18 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s3, 1
; GFX7-NEXT: s_lshr_b32 s0, s3, 1
+; GFX7-NEXT: s_and_b32 s1, s3, 1
; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 4
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: s_lshl_b32 s2, s2, s1
; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, s1, v2
+; GFX7-NEXT: v_bfi_b32 v2, s1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v2, s2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1090,8 +1076,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v4, v0, v3
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -1117,8 +1102,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -1228,8 +1212,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -1246,17 +1229,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v1
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s0
; GFX7-NEXT: v_mov_b32_e32 v4, s1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -1356,16 +1338,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT: v_not_b32_e32 v2, v2
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v2, v7
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1382,16 +1363,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
; GFX7-NEXT: s_and_b32 s0, s2, 0xffff
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT: v_not_b32_e32 v2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1479,15 +1459,14 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: s_lshr_b32 s0, s2, 1
; GFX8-NEXT: s_lshl_b32 s1, s1, 4
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, s1, v5
+; GFX8-NEXT: v_bfi_b32 v5, s1, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1501,19 +1480,18 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s2, 1
; GFX7-NEXT: s_lshr_b32 s0, s2, 1
+; GFX7-NEXT: s_and_b32 s1, s2, 1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_lshl_b32 s1, s1, 4
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2
; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, s1, v3
+; GFX7-NEXT: v_bfi_b32 v3, s1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1601,16 +1579,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX8-NEXT: v_not_b32_e32 v3, v3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, v6, v3
+; GFX8-NEXT: v_bfi_b32 v3, v3, 0, v6
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1627,16 +1604,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_bfi_b32 v3, v3, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1910,14 +1886,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-LABEL: insertelement_v_v8i16_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s3, 1
; GFX8-NEXT: s_lshr_b32 s4, s3, 1
+; GFX8-NEXT: s_and_b32 s0, s3, 1
; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_lshl_b32 s5, s1, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s6, s0
+; GFX8-NEXT: s_lshl_b32 s5, s1, s0
+; GFX8-NEXT: s_lshl_b32 s6, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
@@ -1926,7 +1901,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v6, s6, v6
+; GFX8-NEXT: v_bfi_b32 v6, s6, 0, v6
; GFX8-NEXT: v_or_b32_e32 v6, s5, v6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
@@ -1942,14 +1917,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s3, 1
; GFX7-NEXT: s_lshr_b32 s4, s3, 1
+; GFX7-NEXT: s_and_b32 s0, s3, 1
; GFX7-NEXT: s_and_b32 s1, s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: s_lshl_b32 s5, s1, s0
-; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s6, s0
+; GFX7-NEXT: s_lshl_b32 s5, s1, s0
+; GFX7-NEXT: s_lshl_b32 s6, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -1958,7 +1932,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT: v_bfi_b32 v4, s6, 0, v4
; GFX7-NEXT: v_or_b32_e32 v4, s5, v4
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
@@ -2263,17 +2237,16 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v6, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
@@ -2294,23 +2267,22 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
; GFX7-NEXT: v_mov_b32_e32 v5, s11
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v5, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s9
@@ -2441,23 +2413,22 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX8-NEXT: v_or_b32_e32 v6, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2478,23 +2449,22 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v5, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
@@ -2628,7 +2598,6 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v7, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v8, 0
@@ -2636,7 +2605,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2
; GFX8-NEXT: v_or_b32_e32 v9, v0, v9
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
@@ -2658,9 +2627,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
+; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -2668,7 +2636,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v7, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v7
; GFX7-NEXT: v_or_b32_e32 v7, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
@@ -2773,13 +2741,12 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-LABEL: insertelement_v_v8i16_v_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s2, 1
; GFX8-NEXT: s_lshr_b32 s4, s2, 1
+; GFX8-NEXT: s_and_b32 s0, s2, 1
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s5, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_lshl_b32 s5, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -2789,7 +2756,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX8-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
@@ -2805,14 +2772,13 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s2, 1
; GFX7-NEXT: s_lshr_b32 s4, s2, 1
+; GFX7-NEXT: s_and_b32 s0, s2, 1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
-; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s5, s0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_lshl_b32 s5, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -2821,7 +2787,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX7-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX7-NEXT: v_or_b32_e32 v7, v1, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
@@ -2935,7 +2901,6 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v9, 0
@@ -2943,7 +2908,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3
; GFX8-NEXT: v_or_b32_e32 v3, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
@@ -2959,15 +2924,14 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v3
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -2975,7 +2939,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
@@ -3283,19 +3247,18 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX8-NEXT: s_and_b32 s0, s3, 1
+; GFX8-NEXT: s_lshr_b32 m0, s3, 1
; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_lshr_b32 m0, s3, 1
; GFX8-NEXT: s_lshl_b32 s1, s1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_mov_b32_e32 v9, 0
; GFX8-NEXT: v_mov_b32_e32 v10, 16
; GFX8-NEXT: v_mov_b32_e32 v11, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_movrels_b32_e32 v12, v0
-; GFX8-NEXT: v_and_b32_e32 v12, s0, v12
+; GFX8-NEXT: v_bfi_b32 v12, s0, 0, v12
; GFX8-NEXT: v_or_b32_e32 v12, s1, v12
; GFX8-NEXT: v_movreld_b32_e32 v0, v12
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -3310,17 +3273,16 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr
; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: s_and_b32 s0, s3, 1
+; GFX7-NEXT: s_lshr_b32 m0, s3, 1
; GFX7-NEXT: s_and_b32 s1, s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: s_lshr_b32 m0, s3, 1
; GFX7-NEXT: s_lshl_b32 s1, s1, s0
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_movrels_b32_e32 v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, s1, v0
; GFX7-NEXT: v_movreld_b32_e32 v2, v0
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0
@@ -3644,21 +3606,20 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
; GFX8-NEXT: v_mov_b32_e32 v6, s21
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_mov_b32_e32 v7, s22
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_mov_b32_e32 v9, s23
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v9, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v1, s17
@@ -3705,20 +3666,19 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
; GFX7-NEXT: v_mov_b32_e32 v6, s21
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mov_b32_e32 v7, s22
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mov_b32_e32 v9, s23
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v9, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, s16
; GFX7-NEXT: v_mov_b32_e32 v1, s17
@@ -3936,20 +3896,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
; GFX8-NEXT: v_mov_b32_e32 v7, s17
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v9, s18
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v10, s19
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
@@ -3996,20 +3955,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
; GFX7-NEXT: v_mov_b32_e32 v7, s17
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mov_b32_e32 v9, s18
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mov_b32_e32 v10, s19
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v9, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s12
; GFX7-NEXT: v_mov_b32_e32 v1, s13
@@ -4216,7 +4174,6 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v11, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v12, 0
@@ -4231,7 +4188,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
-; GFX8-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2
; GFX8-NEXT: v_or_b32_e32 v15, v0, v15
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v15, s[12:13]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc
@@ -4263,9 +4220,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
+; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
; GFX7-NEXT: s_mov_b64 s[16:17], 0
; GFX7-NEXT: s_mov_b32 s18, -1
@@ -4278,7 +4234,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7]
; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9]
; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11]
-; GFX7-NEXT: v_and_b32_e32 v1, v11, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v11
; GFX7-NEXT: v_or_b32_e32 v11, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
@@ -4452,14 +4408,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va
; GFX8-NEXT: v_mov_b32_e32 v13, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v11, 16
; GFX8-NEXT: v_mov_b32_e32 v12, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_movrels_b32_e32 v13, v3
-; GFX8-NEXT: v_and_b32_e32 v13, s0, v13
+; GFX8-NEXT: v_bfi_b32 v13, s0, 0, v13
; GFX8-NEXT: v_or_b32_e32 v2, v13, v2
; GFX8-NEXT: v_movreld_b32_e32 v3, v2
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
@@ -4474,17 +4429,16 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: s_and_b32 s0, s2, 1
+; GFX7-NEXT: s_lshr_b32 m0, s2, 1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: s_lshr_b32 m0, s2, 1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_movrels_b32_e32 v1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT: v_bfi_b32 v1, s0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_movreld_b32_e32 v3, v0
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
@@ -4611,7 +4565,6 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v12, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v13, 0
@@ -4626,7 +4579,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11]
-; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3
; GFX8-NEXT: v_or_b32_e32 v16, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v16, s[12:13]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc
@@ -4654,13 +4607,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
; GFX7-NEXT: s_mov_b64 s[16:17], 0
; GFX7-NEXT: s_mov_b32 s18, -1
@@ -4673,7 +4625,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11]
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v12, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc