summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/GlobalISel
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/GlobalISel')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll46
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll3647
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll1904
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll1686
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.illegal.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll282
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll200
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.class.s16.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax-64.mir65
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin-64.mir65
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-stacksave-stackrestore.invalid.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax-64.mir65
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin-64.mir65
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-local.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-xchg-flat.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir932
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir116
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir116
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir118
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir118
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll199
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll429
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir77
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll26
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll17
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll105
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll96
56 files changed, 6783 insertions, 3860 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index e1ef3f9be0a5..aa38c63dc9dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -99,15 +99,13 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
; GCN-LABEL: v_andn2_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_not_b32_e32 v1, v1
-; GCN-NEXT: v_and_b32_e32 v0, v0, v1
+; GCN-NEXT: v_bfi_b32 v0, v1, 0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_andn2_i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor i32 %src1, -1
%and = and i32 %src0, %not.src1
@@ -117,14 +115,12 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
; GCN-LABEL: v_andn2_i32_sv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, s2, v0
+; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i32_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i32 %src1, -1
%and = and i32 %src0, %not.src1
@@ -135,14 +131,12 @@ define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
define amdgpu_ps float @v_andn2_i32_vs(i32 %src0, i32 inreg %src1) {
; GCN-LABEL: v_andn2_i32_vs:
; GCN: ; %bb.0:
-; GCN-NEXT: s_not_b32 s0, s2
-; GCN-NEXT: v_and_b32_e32 v0, s0, v0
+; GCN-NEXT: v_bfi_b32 v0, s2, 0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i32_vs:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_not_b32 s0, s2
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v0, s2, 0, v0
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i32 %src1, -1
%and = and i32 %src0, %not.src1
@@ -247,19 +241,15 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_not_b32_e32 v2, v2
-; GCN-NEXT: v_not_b32_e32 v3, v3
-; GCN-NEXT: v_and_b32_e32 v0, v0, v2
-; GCN-NEXT: v_and_b32_e32 v1, v1, v3
+; GCN-NEXT: v_bfi_b32 v0, v2, 0, v0
+; GCN-NEXT: v_bfi_b32 v1, v3, 0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_andn2_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2
-; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2
-; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, 0, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, 0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor i64 %src1, -1
%and = and i64 %src0, %not.src1
@@ -269,18 +259,14 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64_sv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: v_not_b32_e32 v1, v1
-; GCN-NEXT: v_and_b32_e32 v0, s2, v0
-; GCN-NEXT: v_and_b32_e32 v1, s3, v1
+; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2
+; GCN-NEXT: v_bfi_b32 v1, v1, 0, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i64_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
-; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
-; GFX10PLUS-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2
+; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, 0, s3
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i64 %src1, -1
%and = and i64 %src0, %not.src1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index e6e98fb6edf2..206011adf021 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 481a2540eacb..7e297f46a780 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -73,7 +73,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -192,7 +192,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -311,7 +311,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
@@ -429,7 +429,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
@@ -547,7 +547,7 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -666,7 +666,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -785,7 +785,7 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
@@ -903,7 +903,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
@@ -1021,7 +1021,7 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -1140,7 +1140,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -1259,7 +1259,7 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
@@ -1377,7 +1377,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
@@ -1501,6 +1501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1571,6 +1572,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1645,6 +1649,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1715,6 +1720,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1792,6 +1800,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -1902,6 +1911,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1947,6 +1959,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1987,6 +2000,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2031,6 +2047,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2107,6 +2124,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2190,6 +2208,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2418,6 +2439,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 2226fd20fb77..302b2395642d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -7,12 +7,215 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: s_load_dword s3, s[4:5], 0x2
+; CI-NEXT: s_mov_b32 s4, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s2|
+; CI-NEXT: v_cvt_f32_f16_e64 v0, |s3|
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v0
+; CI-NEXT: s_cbranch_vccz .LBB0_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s4, s2, 0x8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v0
+; CI-NEXT: v_mov_b32_e32 v1, s4
+; CI-NEXT: v_mov_b32_e32 v3, s2
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: .LBB0_2: ; %Flow18
+; CI-NEXT: s_xor_b32 s4, s4, 1
+; CI-NEXT: s_cmp_lg_u32 s4, 0
+; CI-NEXT: s_cbranch_scc1 .LBB0_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e32 v1, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v1, 11
+; CI-NEXT: v_frexp_mant_f32_e32 v1, v0
+; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v0
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB0_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB0_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; CI-NEXT: s_cbranch_vccnz .LBB0_5
+; CI-NEXT: s_branch .LBB0_7
+; CI-NEXT: .LBB0_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB0_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: s_and_b32 s4, s2, 0x8000
+; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; CI-NEXT: v_or_b32_e32 v1, s4, v0
+; CI-NEXT: .LBB0_8: ; %Flow19
+; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
+; CI-NEXT: v_cvt_f32_f16_e32 v2, 0
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff
+; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
+; CI-NEXT: s_cselect_b32 s2, 1, 0
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v2
+; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
+; CI-NEXT: s_and_b32 s2, 1, s2
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: s_mov_b32 s2, -1
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: frem_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; VI-NEXT: ; implicit-def: $vgpr2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s0, s[10:11], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x8
+; VI-NEXT: s_mov_b32 s2, 1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_f16_e64 v1, |s0|
+; VI-NEXT: v_cvt_f32_f16_e64 v0, |s1|
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v0
+; VI-NEXT: s_cbranch_vccz .LBB0_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s2, s0, 0x8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v1, v0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB0_2: ; %Flow18
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB0_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e32 v2, v1
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v1
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v5
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_ldexp_f32 v4, v2, 11
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v3, v0
+; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB0_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB0_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; VI-NEXT: s_cbranch_vccnz .LBB0_5
+; VI-NEXT: s_branch .LBB0_7
+; VI-NEXT: .LBB0_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB0_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: s_and_b32 s2, s0, 0x8000
+; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT: v_or_b32_e32 v2, s2, v0
+; VI-NEXT: .LBB0_8: ; %Flow19
+; VI-NEXT: v_mov_b32_e32 v0, 0x7c00
+; VI-NEXT: v_cmp_nlg_f16_e64 vcc, s1, 0
+; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |s0|, v0
+; VI-NEXT: v_mov_b32_e32 v0, 0x7e00
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: s_endpgm
+ %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
+ %r0 = load half, ptr addrspace(1) %in1, align 4
+ %r1 = load half, ptr addrspace(1) %gep2, align 4
+ %r2 = frem half %r0, %r1
+ store half %r2, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
+; CI-LABEL: fast_frem_f16:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_load_dword s6, s[2:3], 0x0
+; CI-NEXT: s_load_dword s4, s[4:5], 0x2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_cvt_f32_f16_e32 v0, s6
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
; CI-NEXT: v_rcp_f32_e32 v4, v2
@@ -27,15 +230,21 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; CI-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
+; CI-NEXT: v_cvt_f32_f16_e32 v2, s6
+; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: v_trunc_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: v_fma_f32 v0, v0, v1, v2
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
-; VI-LABEL: frem_f16:
+; VI-LABEL: fast_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
@@ -65,33 +274,51 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
%r1 = load half, ptr addrspace(1) %gep2, align 4
- %r2 = frem half %r0, %r1
+ %r2 = frem fast half %r0, %r1
store half %r2, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
-; CI-LABEL: fast_frem_f16:
+define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
+; CI-LABEL: unsafe_frem_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: s_load_dword s3, s[4:5], 0x2
+; CI-NEXT: s_load_dword s6, s[2:3], 0x0
+; CI-NEXT: s_load_dword s4, s[4:5], 0x2
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; CI-NEXT: v_cvt_f32_f16_e32 v0, s6
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
+; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
+; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
+; CI-NEXT: v_rcp_f32_e32 v4, v2
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
+; CI-NEXT: v_fma_f32 v4, v5, v4, v4
+; CI-NEXT: v_mul_f32_e32 v5, v3, v4
+; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
+; CI-NEXT: v_fma_f32 v5, v6, v4, v5
+; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_rcp_f32_e32 v2, v1
-; CI-NEXT: v_mul_f32_e32 v2, v0, v2
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; CI-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
+; CI-NEXT: v_cvt_f32_f16_e32 v2, s6
+; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: v_trunc_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: v_fma_f32 v0, v0, v1, v2
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
-; VI-LABEL: fast_frem_f16:
+; VI-LABEL: unsafe_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
@@ -99,11 +326,21 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f16_e32 v0, s3
-; VI-NEXT: v_mul_f16_e32 v0, s2, v0
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_rcp_f32_e32 v3, v2
+; VI-NEXT: v_mul_f32_e32 v4, v0, v3
+; VI-NEXT: v_mad_f32 v5, -v2, v4, v0
+; VI-NEXT: v_mac_f32_e32 v4, v5, v3
+; VI-NEXT: v_mad_f32 v0, -v2, v4, v0
+; VI-NEXT: v_mul_f32_e32 v0, v0, v3
+; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; VI-NEXT: v_add_f32_e32 v0, v0, v4
+; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, s3, v1
+; VI-NEXT: v_fma_f16 v2, -v0, v1, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
@@ -111,59 +348,209 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
%r1 = load half, ptr addrspace(1) %gep2, align 4
- %r2 = frem fast half %r0, %r1
+ %r2 = frem afn half %r0, %r1
store half %r2, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
-; CI-LABEL: unsafe_frem_f16:
+define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
+; CI-LABEL: frem_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: s_load_dword s3, s[4:5], 0x2
+; CI-NEXT: s_load_dword s3, s[4:5], 0x4
+; CI-NEXT: s_mov_b32 s4, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v0, s3
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT: ; implicit-def: $vgpr0
+; CI-NEXT: s_cbranch_vccz .LBB3_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s4, s2, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; CI-NEXT: v_mov_b32_e32 v1, s4
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: .LBB3_2: ; %Flow16
+; CI-NEXT: s_xor_b32 s4, s4, 1
+; CI-NEXT: s_cmp_lg_u32 s4, 0
+; CI-NEXT: s_cbranch_scc1 .LBB3_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB3_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB3_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; CI-NEXT: s_cbranch_vccnz .LBB3_5
+; CI-NEXT: s_branch .LBB3_7
+; CI-NEXT: .LBB3_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB3_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; CI-NEXT: s_and_b32 s4, s2, 0x80000000
+; CI-NEXT: v_or_b32_e32 v0, s4, v0
+; CI-NEXT: .LBB3_8: ; %Flow17
+; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s3, 0
+; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v2
+; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_rcp_f32_e32 v2, v1
-; CI-NEXT: v_mul_f32_e32 v2, v0, v2
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
-; VI-LABEL: unsafe_frem_f16:
+; VI-LABEL: frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_load_dword s3, s[4:5], 0x8
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_mov_b32 s4, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f16_e32 v0, s3
-; VI-NEXT: v_mul_f16_e32 v0, s2, v0
-; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, s3, v1
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: s_cbranch_vccz .LBB3_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s4, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: .LBB3_2: ; %Flow16
+; VI-NEXT: s_xor_b32 s4, s4, 1
+; VI-NEXT: s_cmp_lg_u32 s4, 0
+; VI-NEXT: s_cbranch_scc1 .LBB3_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 12
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB3_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB3_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; VI-NEXT: s_cbranch_vccnz .LBB3_5
+; VI-NEXT: s_branch .LBB3_7
+; VI-NEXT: .LBB3_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB3_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; VI-NEXT: s_and_b32 s4, s2, 0x80000000
+; VI-NEXT: v_or_b32_e32 v0, s4, v0
+; VI-NEXT: .LBB3_8: ; %Flow17
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s3, 0
+; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; VI-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
- %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
- %r0 = load half, ptr addrspace(1) %in1, align 4
- %r1 = load half, ptr addrspace(1) %gep2, align 4
- %r2 = frem afn half %r0, %r1
- store half %r2, ptr addrspace(1) %out, align 4
+ %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
+ %r0 = load float, ptr addrspace(1) %in1, align 4
+ %r1 = load float, ptr addrspace(1) %gep2, align 4
+ %r2 = frem float %r0, %r1
+ store float %r2, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
-; CI-LABEL: frem_f32:
+define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
+; CI-LABEL: fast_frem_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
@@ -192,7 +579,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
-; VI-LABEL: frem_f32:
+; VI-LABEL: fast_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
@@ -223,43 +610,65 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
%r1 = load float, ptr addrspace(1) %gep2, align 4
- %r2 = frem float %r0, %r1
+ %r2 = frem fast float %r0, %r1
store float %r2, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
-; CI-LABEL: fast_frem_f32:
+define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
+; CI-LABEL: unsafe_frem_f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: s_load_dword s3, s[4:5], 0x4
+; CI-NEXT: s_load_dword s6, s[2:3], 0x0
+; CI-NEXT: s_load_dword s2, s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: v_rcp_f32_e32 v0, s3
-; CI-NEXT: v_mul_f32_e32 v0, s2, v0
-; CI-NEXT: v_trunc_f32_e32 v0, v0
-; CI-NEXT: v_fma_f32 v0, -v0, s3, v1
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s6
+; CI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
+; CI-NEXT: v_rcp_f32_e32 v3, v1
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; CI-NEXT: v_fma_f32 v3, v4, v3, v3
+; CI-NEXT: v_mul_f32_e32 v4, v2, v3
+; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
+; CI-NEXT: v_fma_f32 v4, v5, v3, v4
+; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s6
+; CI-NEXT: v_trunc_f32_e32 v1, v1
+; CI-NEXT: v_fma_f32 v0, -v1, v0, s6
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
-; VI-LABEL: fast_frem_f32:
+; VI-LABEL: unsafe_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_load_dword s6, s[2:3], 0x0
+; VI-NEXT: s_load_dword s2, s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f32_e32 v0, s3
-; VI-NEXT: v_mul_f32_e32 v0, s2, v0
-; VI-NEXT: v_trunc_f32_e32 v0, v0
-; VI-NEXT: v_fma_f32 v2, -v0, s3, v1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s6
+; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
+; VI-NEXT: v_rcp_f32_e32 v3, v1
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; VI-NEXT: v_fma_f32 v3, v4, v3, v3
+; VI-NEXT: v_mul_f32_e32 v4, v2, v3
+; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
+; VI-NEXT: v_fma_f32 v4, v5, v3, v4
+; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s6
+; VI-NEXT: v_trunc_f32_e32 v1, v1
+; VI-NEXT: v_fma_f32 v2, -v1, v0, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -267,57 +676,238 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
%r1 = load float, ptr addrspace(1) %gep2, align 4
- %r2 = frem fast float %r0, %r1
+ %r2 = frem afn float %r0, %r1
store float %r2, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
-; CI-LABEL: unsafe_frem_f32:
+define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
+; CI-LABEL: frem_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: s_mov_b32 s6, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: s_load_dword s3, s[4:5], 0x4
+; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: v_rcp_f32_e32 v0, s3
-; CI-NEXT: v_mul_f32_e32 v0, s2, v0
-; CI-NEXT: v_trunc_f32_e32 v0, v0
-; CI-NEXT: v_fma_f32 v0, -v0, s3, v1
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CI-NEXT: s_cbranch_vccz .LBB6_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_brev_b32 s7, 1
+; CI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT: v_mov_b32_e32 v0, s6
+; CI-NEXT: v_mov_b32_e32 v1, s7
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: .LBB6_2: ; %Flow16
+; CI-NEXT: s_xor_b32 s6, s6, 1
+; CI-NEXT: s_cmp_lg_u32 s6, 0
+; CI-NEXT: s_cbranch_scc1 .LBB6_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; CI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v6
+; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v2, v8
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0
+; CI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; CI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3]
+; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB6_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7
+; CI-NEXT: .LBB6_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT: v_add_i32_e32 v9, vcc, 0xffffffe6, v9
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; CI-NEXT: s_cbranch_vccnz .LBB6_5
+; CI-NEXT: s_branch .LBB6_7
+; CI-NEXT: .LBB6_6:
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: .LBB6_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9
+; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_brev_b32 s7, 1
+; CI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT: v_or_b32_e32 v0, s6, v0
+; CI-NEXT: v_or_b32_e32 v1, s7, v1
+; CI-NEXT: .LBB6_8: ; %Flow17
+; CI-NEXT: v_cmp_nlg_f64_e64 vcc, s[4:5], 0
+; CI-NEXT: v_mov_b32_e32 v2, 0x7ff80000
+; CI-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc
+; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: v_mov_b32_e32 v1, 0x7ff00000
+; CI-NEXT: v_cmp_nge_f64_e64 vcc, |s[2:3]|, v[0:1]
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
-; VI-LABEL: unsafe_frem_f32:
+; VI-LABEL: frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_mov_b32 s6, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f32_e32 v0, s3
-; VI-NEXT: v_mul_f32_e32 v0, s2, v0
-; VI-NEXT: v_trunc_f32_e32 v0, v0
-; VI-NEXT: v_fma_f32 v2, -v0, s3, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT: s_cbranch_vccz .LBB6_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: s_brev_b32 s7, 1
+; VI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: .LBB6_2: ; %Flow16
+; VI-NEXT: s_xor_b32 s6, s6, 1
+; VI-NEXT: s_cmp_lg_u32 s6, 0
+; VI-NEXT: s_cbranch_scc1 .LBB6_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; VI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v6
+; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v2, v8
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; VI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0
+; VI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3]
+; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB6_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7
+; VI-NEXT: .LBB6_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0xffffffe6, v9
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; VI-NEXT: s_cbranch_vccnz .LBB6_5
+; VI-NEXT: s_branch .LBB6_7
+; VI-NEXT: .LBB6_6:
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: .LBB6_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9
+; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: s_brev_b32 s7, 1
+; VI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; VI-NEXT: v_or_b32_e32 v0, s6, v0
+; VI-NEXT: v_or_b32_e32 v1, s7, v1
+; VI-NEXT: .LBB6_8: ; %Flow17
+; VI-NEXT: v_cmp_nlg_f64_e64 vcc, s[4:5], 0
+; VI-NEXT: v_mov_b32_e32 v2, 0x7ff80000
+; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: v_mov_b32_e32 v1, 0x7ff00000
+; VI-NEXT: v_cmp_nge_f64_e64 vcc, |s[2:3]|, v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
- %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
- %r0 = load float, ptr addrspace(1) %in1, align 4
- %r1 = load float, ptr addrspace(1) %gep2, align 4
- %r2 = frem afn float %r0, %r1
- store float %r2, ptr addrspace(1) %out, align 4
+ %r0 = load double, ptr addrspace(1) %in1, align 8
+ %r1 = load double, ptr addrspace(1) %in2, align 8
+ %r2 = frem double %r0, %r1
+ store double %r2, ptr addrspace(1) %out, align 8
ret void
}
-define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
-; CI-LABEL: frem_f64:
+define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
+; CI-LABEL: fast_frem_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
@@ -345,7 +935,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
-; VI-LABEL: frem_f64:
+; VI-LABEL: fast_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
@@ -374,63 +964,6 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: s_endpgm
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
- %r2 = frem double %r0, %r1
- store double %r2, ptr addrspace(1) %out, align 8
- ret void
-}
-
-define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
-; CI-LABEL: fast_frem_f64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5]
-; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
-; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
-; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: fast_frem_f64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5]
-; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %r0 = load double, ptr addrspace(1) %in1, align 8
- %r1 = load double, ptr addrspace(1) %in2, align 8
%r2 = frem fast double %r0, %r1
store double %r2, ptr addrspace(1) %out, align 8
ret void
@@ -445,20 +978,23 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5]
-; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3]
+; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
+; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
+; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
+; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
@@ -470,18 +1006,21 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5]
-; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3]
+; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
+; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -497,102 +1036,372 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: frem_v2f16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd
+; CI-NEXT: ; implicit-def: $vgpr0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: s_load_dword s3, s[4:5], 0x4
+; CI-NEXT: s_load_dword s0, s[10:11], 0x0
+; CI-NEXT: s_load_dword s1, s[2:3], 0x4
+; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
-; CI-NEXT: s_lshr_b32 s4, s2, 16
-; CI-NEXT: s_lshr_b32 s5, s3, 16
-; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
-; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
-; CI-NEXT: v_rcp_f32_e32 v4, v2
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s0|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
+; CI-NEXT: s_cbranch_vccz .LBB9_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s2, s0, 0x8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v3, s0
+; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB9_2: ; %Flow57
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB9_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
+; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
+; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v0, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 11
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT: v_fma_f32 v4, v5, v4, v4
-; CI-NEXT: v_mul_f32_e32 v5, v3, v4
-; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT: v_fma_f32 v5, v6, v4, v5
-; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; CI-NEXT: v_cvt_f32_f16_e32 v2, s5
-; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB9_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB9_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; CI-NEXT: s_cbranch_vccnz .LBB9_5
+; CI-NEXT: s_branch .LBB9_7
+; CI-NEXT: .LBB9_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB9_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, v1
-; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1
-; CI-NEXT: v_rcp_f32_e32 v5, v3
+; CI-NEXT: s_and_b32 s2, s0, 0x8000
+; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; CI-NEXT: v_or_b32_e32 v0, s2, v0
+; CI-NEXT: .LBB9_8: ; %Flow58
+; CI-NEXT: s_lshr_b32 s2, s0, 16
+; CI-NEXT: s_lshr_b32 s3, s1, 16
+; CI-NEXT: v_cvt_f32_f16_e64 v3, |s2|
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s3|
+; CI-NEXT: s_mov_b32 s4, 1
+; CI-NEXT: ; implicit-def: $vgpr1
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
+; CI-NEXT: s_cbranch_vccz .LBB9_10
+; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: s_and_b32 s4, s2, 0x8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
+; CI-NEXT: v_mov_b32_e32 v1, s4
+; CI-NEXT: v_mov_b32_e32 v4, s2
+; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: .LBB9_10: ; %Flow53
+; CI-NEXT: s_xor_b32 s4, s4, 1
+; CI-NEXT: s_cmp_lg_u32 s4, 0
+; CI-NEXT: s_cbranch_scc1 .LBB9_16
+; CI-NEXT: ; %bb.11: ; %frem.compute19
+; CI-NEXT: v_frexp_mant_f32_e32 v4, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
+; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1
+; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v1, v3
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT: v_ldexp_f32_e64 v5, v1, 11
+; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT: v_rcp_f32_e32 v9, v4
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
-; CI-NEXT: v_fma_f32 v5, v6, v5, v5
-; CI-NEXT: v_mul_f32_e32 v6, v4, v5
-; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
-; CI-NEXT: v_fma_f32 v6, v7, v5, v6
-; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
+; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT: v_fma_f32 v9, v10, v9, v9
+; CI-NEXT: v_mul_f32_e32 v10, v8, v9
+; CI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT: v_fma_f32 v10, v11, v9, v10
+; CI-NEXT: v_fma_f32 v4, -v4, v10, v8
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
-; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1
-; CI-NEXT: v_trunc_f32_e32 v3, v3
-; CI-NEXT: v_fma_f32 v1, -v3, v2, v1
+; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
+; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB9_14
+; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT: .LBB9_13: ; %frem.loop_body27
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: v_mul_f32_e32 v5, v6, v4
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v7, v5, v2
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3
+; CI-NEXT: s_cbranch_vccnz .LBB9_13
+; CI-NEXT: s_branch .LBB9_15
+; CI-NEXT: .LBB9_14:
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3
+; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT: v_mul_f32_e32 v4, v3, v4
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT: v_add_f32_e32 v2, v3, v2
+; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: s_and_b32 s4, s2, 0x8000
+; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; CI-NEXT: v_or_b32_e32 v1, s4, v1
+; CI-NEXT: .LBB9_16: ; %Flow54
+; CI-NEXT: v_cvt_f32_f16_e32 v2, s1
+; CI-NEXT: v_cvt_f32_f16_e32 v3, 0
+; CI-NEXT: s_and_b32 s0, s0, 0x7fff
+; CI-NEXT: s_cmpk_lg_i32 s0, 0x7c00
+; CI-NEXT: s_cselect_b32 s4, 1, 0
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff
+; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
+; CI-NEXT: s_cselect_b32 s2, 1, 0
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v2, v3
+; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT: v_mov_b32_e32 v2, 0x7e00
+; CI-NEXT: s_and_b32 s3, 1, s4
+; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s3
+; CI-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; CI-NEXT: s_and_b32 s0, 1, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
-; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT: s_mov_b32 s10, -1
+; CI-NEXT: s_mov_b32 s11, 0xf000
+; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: frem_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_load_dword s0, s[10:11], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x10
+; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT: s_lshr_b32 s5, s3, 16
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_lshr_b32 s4, s2, 16
-; VI-NEXT: v_rcp_f32_e32 v3, v2
-; VI-NEXT: v_mul_f32_e32 v4, v0, v3
-; VI-NEXT: v_mad_f32 v5, -v2, v4, v0
-; VI-NEXT: v_mac_f32_e32 v4, v5, v3
-; VI-NEXT: v_mad_f32 v0, -v2, v4, v0
-; VI-NEXT: v_mul_f32_e32 v0, v0, v3
-; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; VI-NEXT: v_add_f32_e32 v0, v0, v4
+; VI-NEXT: v_cvt_f32_f16_e64 v2, |s0|
+; VI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
+; VI-NEXT: s_cbranch_vccz .LBB9_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s2, s0, 0x8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB9_2: ; %Flow57
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB9_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e32 v3, v1
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
+; VI-NEXT: v_ldexp_f32 v1, v3, 1
+; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v0, v2
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 11
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB9_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB9_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; VI-NEXT: s_cbranch_vccnz .LBB9_5
+; VI-NEXT: s_branch .LBB9_7
+; VI-NEXT: .LBB9_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB9_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: v_cvt_f32_f16_e32 v3, s5
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
-; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; VI-NEXT: v_rcp_f32_e32 v4, v3
-; VI-NEXT: v_mul_f32_e32 v5, v1, v4
-; VI-NEXT: v_mad_f32 v6, -v3, v5, v1
-; VI-NEXT: v_mac_f32_e32 v5, v6, v4
-; VI-NEXT: v_mad_f32 v1, -v3, v5, v1
-; VI-NEXT: v_mul_f32_e32 v1, v1, v4
-; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; VI-NEXT: v_add_f32_e32 v1, v1, v5
+; VI-NEXT: s_and_b32 s2, s0, 0x8000
+; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT: v_or_b32_e32 v0, s2, v0
+; VI-NEXT: .LBB9_8: ; %Flow58
+; VI-NEXT: s_lshr_b32 s4, s0, 16
+; VI-NEXT: s_lshr_b32 s2, s1, 16
+; VI-NEXT: v_cvt_f32_f16_e64 v3, |s4|
+; VI-NEXT: v_cvt_f32_f16_e64 v2, |s2|
+; VI-NEXT: s_mov_b32 s3, 1
+; VI-NEXT: ; implicit-def: $vgpr1
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
+; VI-NEXT: s_cbranch_vccz .LBB9_10
+; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: s_and_b32 s3, s4, 0x8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: .LBB9_10: ; %Flow53
+; VI-NEXT: s_xor_b32 s3, s3, 1
+; VI-NEXT: s_cmp_lg_u32 s3, 0
+; VI-NEXT: s_cbranch_scc1 .LBB9_16
+; VI-NEXT: ; %bb.11: ; %frem.compute19
+; VI-NEXT: v_frexp_mant_f32_e32 v4, v2
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
+; VI-NEXT: v_ldexp_f32 v2, v4, 1
+; VI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v3
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v5, v1, 11
+; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT: v_rcp_f32_e32 v9, v4
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT: v_fma_f32 v9, v10, v9, v9
+; VI-NEXT: v_mul_f32_e32 v10, v8, v9
+; VI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT: v_fma_f32 v10, v11, v9, v10
+; VI-NEXT: v_fma_f32 v4, -v4, v10, v8
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
+; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB9_14
+; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT: .LBB9_13: ; %frem.loop_body27
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v5, v6, v4
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v7, v5, v2
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT: v_ldexp_f32 v5, v5, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3
+; VI-NEXT: s_cbranch_vccnz .LBB9_13
+; VI-NEXT: s_branch .LBB9_15
+; VI-NEXT: .LBB9_14:
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3
+; VI-NEXT: v_ldexp_f32 v3, v6, v3
+; VI-NEXT: v_mul_f32_e32 v4, v3, v4
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: v_ldexp_f32 v1, v2, v1
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s4
-; VI-NEXT: v_trunc_f16_e32 v1, v1
-; VI-NEXT: v_fma_f16 v1, -v1, v2, s4
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: s_and_b32 s3, s4, 0x8000
+; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v1, s3, v1
+; VI-NEXT: .LBB9_16: ; %Flow54
+; VI-NEXT: v_mov_b32_e32 v2, 0x7c00
+; VI-NEXT: v_cmp_nlg_f16_e64 vcc, s1, 0
+; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |s0|, v2
+; VI-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, 0
+; VI-NEXT: v_cmp_nge_f16_e64 s[4:5], |s4|, v2
+; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
+; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_mov_b64 vcc, s[2:3]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_mov_b64 vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
@@ -606,176 +1415,714 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: frem_v4f16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT: ; implicit-def: $vgpr0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
+; CI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
+; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; CI-NEXT: s_mov_b32 s0, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; CI-NEXT: s_lshr_b32 s8, s2, 16
-; CI-NEXT: s_lshr_b32 s9, s3, 16
-; CI-NEXT: s_lshr_b32 s10, s4, 16
-; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, v0
-; CI-NEXT: s_lshr_b32 s11, s5, 16
-; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
-; CI-NEXT: v_rcp_f32_e32 v4, v2
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s4|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, |s2|
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
+; CI-NEXT: s_cbranch_vccz .LBB10_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s0, s4, 0x8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v3, s4
+; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT: s_mov_b32 s0, 0
+; CI-NEXT: .LBB10_2: ; %Flow135
+; CI-NEXT: s_xor_b32 s0, s0, 1
+; CI-NEXT: s_cmp_lg_u32 s0, 0
+; CI-NEXT: s_cbranch_scc1 .LBB10_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
+; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
+; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v0, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 11
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT: v_fma_f32 v4, v5, v4, v4
-; CI-NEXT: v_mul_f32_e32 v5, v3, v4
-; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT: v_fma_f32 v5, v6, v4, v5
-; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s8
-; CI-NEXT: v_cvt_f32_f16_e32 v2, s10
-; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB10_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB10_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; CI-NEXT: s_cbranch_vccnz .LBB10_5
+; CI-NEXT: s_branch .LBB10_7
+; CI-NEXT: .LBB10_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB10_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_div_scale_f32 v3, s[6:7], v2, v2, v1
-; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1
-; CI-NEXT: v_rcp_f32_e32 v5, v3
+; CI-NEXT: s_and_b32 s0, s4, 0x8000
+; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; CI-NEXT: v_or_b32_e32 v0, s0, v0
+; CI-NEXT: .LBB10_8: ; %Flow136
+; CI-NEXT: s_lshr_b32 s6, s4, 16
+; CI-NEXT: s_lshr_b32 s0, s2, 16
+; CI-NEXT: v_cvt_f32_f16_e64 v3, |s6|
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s0|
+; CI-NEXT: s_mov_b32 s1, 1
+; CI-NEXT: ; implicit-def: $vgpr1
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
+; CI-NEXT: s_cbranch_vccz .LBB10_10
+; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: s_and_b32 s1, s6, 0x8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v4, s6
+; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT: s_mov_b32 s1, 0
+; CI-NEXT: .LBB10_10: ; %Flow131
+; CI-NEXT: s_xor_b32 s1, s1, 1
+; CI-NEXT: s_cmp_lg_u32 s1, 0
+; CI-NEXT: s_cbranch_scc1 .LBB10_16
+; CI-NEXT: ; %bb.11: ; %frem.compute19
+; CI-NEXT: v_frexp_mant_f32_e32 v4, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
+; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1
+; CI-NEXT: v_div_scale_f32 v4, s[10:11], v2, v2, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v1, v3
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT: v_ldexp_f32_e64 v5, v1, 11
+; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT: v_rcp_f32_e32 v9, v4
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
-; CI-NEXT: v_fma_f32 v5, v6, v5, v5
-; CI-NEXT: v_mul_f32_e32 v6, v4, v5
-; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
-; CI-NEXT: v_fma_f32 v6, v7, v5, v6
-; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
+; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT: v_fma_f32 v9, v10, v9, v9
+; CI-NEXT: v_mul_f32_e32 v10, v8, v9
+; CI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT: v_fma_f32 v10, v11, v9, v10
+; CI-NEXT: v_fma_f32 v4, -v4, v10, v8
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
-; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1
-; CI-NEXT: v_trunc_f32_e32 v3, v3
-; CI-NEXT: v_fma_f32 v1, -v3, v2, v1
-; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; CI-NEXT: v_cvt_f32_f16_e32 v3, s5
+; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
+; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB10_14
+; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT: .LBB10_13: ; %frem.loop_body27
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: v_mul_f32_e32 v5, v6, v4
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v7, v5, v2
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3
+; CI-NEXT: s_cbranch_vccnz .LBB10_13
+; CI-NEXT: s_branch .LBB10_15
+; CI-NEXT: .LBB10_14:
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3
+; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT: v_mul_f32_e32 v4, v3, v4
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT: v_add_f32_e32 v2, v3, v2
+; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, v2
-; CI-NEXT: v_div_scale_f32 v5, vcc, v2, v3, v2
-; CI-NEXT: v_rcp_f32_e32 v6, v4
+; CI-NEXT: s_and_b32 s1, s6, 0x8000
+; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; CI-NEXT: v_or_b32_e32 v1, s1, v1
+; CI-NEXT: .LBB10_16: ; %Flow132
+; CI-NEXT: v_cvt_f32_f16_e64 v4, |s5|
+; CI-NEXT: v_cvt_f32_f16_e64 v3, |s3|
+; CI-NEXT: s_mov_b32 s1, 1
+; CI-NEXT: ; implicit-def: $vgpr2
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
+; CI-NEXT: s_cbranch_vccz .LBB10_18
+; CI-NEXT: ; %bb.17: ; %frem.else53
+; CI-NEXT: s_and_b32 s1, s5, 0x8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
+; CI-NEXT: v_mov_b32_e32 v2, s1
+; CI-NEXT: v_mov_b32_e32 v5, s5
+; CI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; CI-NEXT: s_mov_b32 s1, 0
+; CI-NEXT: .LBB10_18: ; %Flow127
+; CI-NEXT: s_xor_b32 s1, s1, 1
+; CI-NEXT: s_cmp_lg_u32 s1, 0
+; CI-NEXT: s_cbranch_scc1 .LBB10_24
+; CI-NEXT: ; %bb.19: ; %frem.compute52
+; CI-NEXT: v_frexp_mant_f32_e32 v5, v3
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3
+; CI-NEXT: v_ldexp_f32_e64 v3, v5, 1
+; CI-NEXT: v_div_scale_f32 v5, s[10:11], v3, v3, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v2, v4
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4
+; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v7
+; CI-NEXT: v_ldexp_f32_e64 v6, v2, 11
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8
+; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2
+; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; CI-NEXT: v_rcp_f32_e32 v10, v5
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
-; CI-NEXT: v_fma_f32 v6, v7, v6, v6
-; CI-NEXT: v_mul_f32_e32 v7, v5, v6
-; CI-NEXT: v_fma_f32 v8, -v4, v7, v5
-; CI-NEXT: v_fma_f32 v7, v8, v6, v7
-; CI-NEXT: v_fma_f32 v4, -v4, v7, v5
+; CI-NEXT: v_fma_f32 v11, -v5, v10, 1.0
+; CI-NEXT: v_fma_f32 v10, v11, v10, v10
+; CI-NEXT: v_mul_f32_e32 v11, v9, v10
+; CI-NEXT: v_fma_f32 v12, -v5, v11, v9
+; CI-NEXT: v_fma_f32 v11, v12, v10, v11
+; CI-NEXT: v_fma_f32 v5, -v5, v11, v9
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
-; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v2
-; CI-NEXT: v_trunc_f32_e32 v4, v4
-; CI-NEXT: v_fma_f32 v2, -v4, v3, v2
-; CI-NEXT: v_cvt_f32_f16_e32 v3, s9
-; CI-NEXT: v_cvt_f32_f16_e32 v4, s11
+; CI-NEXT: v_div_fmas_f32 v5, v5, v10, v11
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4
+; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB10_22
+; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v7
+; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
+; CI-NEXT: .LBB10_21: ; %frem.loop_body60
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v6
+; CI-NEXT: v_mul_f32_e32 v6, v7, v5
+; CI-NEXT: v_rndne_f32_e32 v6, v6
+; CI-NEXT: v_fma_f32 v6, -v6, v3, v7
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6
+; CI-NEXT: v_add_f32_e32 v8, v6, v3
+; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4
+; CI-NEXT: v_ldexp_f32_e64 v6, v6, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4
+; CI-NEXT: s_cbranch_vccnz .LBB10_21
+; CI-NEXT: s_branch .LBB10_23
+; CI-NEXT: .LBB10_22:
+; CI-NEXT: v_mov_b32_e32 v7, v6
+; CI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4
+; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4
+; CI-NEXT: v_mul_f32_e32 v5, v4, v5
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v4, -v5, v3, v4
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v3, v4, v3
+; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_div_scale_f32 v5, s[2:3], v4, v4, v3
-; CI-NEXT: v_div_scale_f32 v6, vcc, v3, v4, v3
-; CI-NEXT: v_rcp_f32_e32 v7, v5
+; CI-NEXT: s_and_b32 s1, s5, 0x8000
+; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; CI-NEXT: v_or_b32_e32 v2, s1, v2
+; CI-NEXT: .LBB10_24: ; %Flow128
+; CI-NEXT: s_lshr_b32 s7, s5, 16
+; CI-NEXT: s_lshr_b32 s10, s3, 16
+; CI-NEXT: v_cvt_f32_f16_e64 v5, |s7|
+; CI-NEXT: v_cvt_f32_f16_e64 v4, |s10|
+; CI-NEXT: s_mov_b32 s1, 1
+; CI-NEXT: ; implicit-def: $vgpr3
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4
+; CI-NEXT: s_cbranch_vccz .LBB10_26
+; CI-NEXT: ; %bb.25: ; %frem.else86
+; CI-NEXT: s_and_b32 s1, s7, 0x8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v6, s7
+; CI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; CI-NEXT: s_mov_b32 s1, 0
+; CI-NEXT: .LBB10_26: ; %Flow123
+; CI-NEXT: s_xor_b32 s1, s1, 1
+; CI-NEXT: s_cmp_lg_u32 s1, 0
+; CI-NEXT: s_cbranch_scc1 .LBB10_32
+; CI-NEXT: ; %bb.27: ; %frem.compute85
+; CI-NEXT: v_frexp_mant_f32_e32 v6, v4
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4
+; CI-NEXT: v_ldexp_f32_e64 v4, v6, 1
+; CI-NEXT: v_div_scale_f32 v6, s[12:13], v4, v4, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v3, v5
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v5
+; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v8
+; CI-NEXT: v_ldexp_f32_e64 v7, v3, 11
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v9
+; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v3
+; CI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; CI-NEXT: v_rcp_f32_e32 v11, v6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
-; CI-NEXT: v_fma_f32 v7, v8, v7, v7
-; CI-NEXT: v_mul_f32_e32 v8, v6, v7
-; CI-NEXT: v_fma_f32 v9, -v5, v8, v6
-; CI-NEXT: v_fma_f32 v8, v9, v7, v8
-; CI-NEXT: v_fma_f32 v5, -v5, v8, v6
+; CI-NEXT: v_fma_f32 v12, -v6, v11, 1.0
+; CI-NEXT: v_fma_f32 v11, v12, v11, v11
+; CI-NEXT: v_mul_f32_e32 v12, v10, v11
+; CI-NEXT: v_fma_f32 v13, -v6, v12, v10
+; CI-NEXT: v_fma_f32 v12, v13, v11, v12
+; CI-NEXT: v_fma_f32 v6, -v6, v12, v10
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
+; CI-NEXT: v_div_fmas_f32 v6, v6, v11, v12
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5
+; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB10_30
+; CI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; CI-NEXT: v_add_i32_e32 v5, vcc, 11, v8
+; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
+; CI-NEXT: .LBB10_29: ; %frem.loop_body93
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v8, v7
+; CI-NEXT: v_mul_f32_e32 v7, v8, v6
+; CI-NEXT: v_rndne_f32_e32 v7, v7
+; CI-NEXT: v_fma_f32 v7, -v7, v4, v8
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7
+; CI-NEXT: v_add_f32_e32 v9, v7, v4
+; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5
+; CI-NEXT: v_ldexp_f32_e64 v7, v7, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v5
+; CI-NEXT: s_cbranch_vccnz .LBB10_29
+; CI-NEXT: s_branch .LBB10_31
+; CI-NEXT: .LBB10_30:
+; CI-NEXT: v_mov_b32_e32 v8, v7
+; CI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; CI-NEXT: v_add_i32_e32 v5, vcc, -10, v5
+; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5
+; CI-NEXT: v_mul_f32_e32 v6, v5, v6
+; CI-NEXT: v_rndne_f32_e32 v6, v6
+; CI-NEXT: v_fma_f32 v5, -v6, v4, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v4, v5, v4
+; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; CI-NEXT: v_ldexp_f32_e32 v3, v4, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: s_and_b32 s1, s7, 0x8000
+; CI-NEXT: v_and_b32_e32 v3, 0x7fff, v3
+; CI-NEXT: v_or_b32_e32 v3, s1, v3
+; CI-NEXT: .LBB10_32: ; %Flow124
+; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
+; CI-NEXT: v_cvt_f32_f16_e32 v5, 0
+; CI-NEXT: s_and_b32 s1, s4, 0x7fff
+; CI-NEXT: s_cmpk_lg_i32 s1, 0x7c00
+; CI-NEXT: s_cselect_b32 s11, 1, 0
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, s0
+; CI-NEXT: s_and_b32 s2, s6, 0x7fff
+; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
+; CI-NEXT: s_cselect_b32 s6, 1, 0
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v4, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, s3
+; CI-NEXT: s_and_b32 s4, s5, 0x7fff
+; CI-NEXT: s_cmpk_lg_i32 s4, 0x7c00
+; CI-NEXT: s_cselect_b32 s12, 1, 0
+; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], v4, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, s10
+; CI-NEXT: s_and_b32 s7, s7, 0x7fff
+; CI-NEXT: s_cmpk_lg_i32 s7, 0x7c00
+; CI-NEXT: s_cselect_b32 s7, 1, 0
+; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], v4, v5
+; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT: v_mov_b32_e32 v4, 0x7e00
+; CI-NEXT: s_and_b32 s10, 1, s11
+; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s10
+; CI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; CI-NEXT: s_and_b32 s0, 1, s6
+; CI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
-; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3
-; CI-NEXT: v_trunc_f32_e32 v5, v5
-; CI-NEXT: v_fma_f32 v3, -v5, v4, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; CI-NEXT: v_or_b32_e32 v1, v2, v1
-; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; CI-NEXT: s_and_b32 s0, 1, s12
+; CI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3]
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; CI-NEXT: s_and_b32 s0, 1, s7
+; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_or_b32_e32 v1, v1, v2
+; CI-NEXT: s_mov_b32 s10, -1
+; CI-NEXT: s_mov_b32 s11, 0xf000
+; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: frem_v4f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
+; VI-NEXT: s_load_dwordx2 s[8:9], s[18:19], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
+; VI-NEXT: s_mov_b32 s0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s4
-; VI-NEXT: s_lshr_b32 s8, s4, 16
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: s_lshr_b32 s6, s2, 16
-; VI-NEXT: v_rcp_f32_e32 v3, v2
-; VI-NEXT: s_lshr_b32 s9, s5, 16
-; VI-NEXT: s_lshr_b32 s7, s3, 16
-; VI-NEXT: v_mul_f32_e32 v4, v0, v3
-; VI-NEXT: v_mad_f32 v5, -v2, v4, v0
-; VI-NEXT: v_mac_f32_e32 v4, v5, v3
-; VI-NEXT: v_mad_f32 v0, -v2, v4, v0
-; VI-NEXT: v_mul_f32_e32 v0, v0, v3
-; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; VI-NEXT: v_add_f32_e32 v0, v0, v4
+; VI-NEXT: v_cvt_f32_f16_e64 v2, |s8|
+; VI-NEXT: v_cvt_f32_f16_e64 v1, |s6|
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
+; VI-NEXT: s_cbranch_vccz .LBB10_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s0, s8, 0x8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s8
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: .LBB10_2: ; %Flow135
+; VI-NEXT: s_xor_b32 s0, s0, 1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cbranch_scc1 .LBB10_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e32 v3, v1
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
+; VI-NEXT: v_ldexp_f32 v1, v3, 1
+; VI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v0, v2
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 11
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB10_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB10_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; VI-NEXT: s_cbranch_vccnz .LBB10_5
+; VI-NEXT: s_branch .LBB10_7
+; VI-NEXT: .LBB10_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB10_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: v_cvt_f32_f16_e32 v3, s8
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
-; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v1, s6
-; VI-NEXT: v_rcp_f32_e32 v4, v3
-; VI-NEXT: v_mul_f32_e32 v5, v1, v4
-; VI-NEXT: v_mad_f32 v6, -v3, v5, v1
-; VI-NEXT: v_mac_f32_e32 v5, v6, v4
-; VI-NEXT: v_mad_f32 v1, -v3, v5, v1
-; VI-NEXT: v_mul_f32_e32 v1, v1, v4
-; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; VI-NEXT: v_add_f32_e32 v1, v1, v5
+; VI-NEXT: s_and_b32 s0, s8, 0x8000
+; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT: v_or_b32_e32 v0, s0, v0
+; VI-NEXT: .LBB10_8: ; %Flow136
+; VI-NEXT: s_lshr_b32 s4, s8, 16
+; VI-NEXT: s_lshr_b32 s2, s6, 16
+; VI-NEXT: v_cvt_f32_f16_e64 v3, |s4|
+; VI-NEXT: v_cvt_f32_f16_e64 v2, |s2|
+; VI-NEXT: s_mov_b32 s0, 1
+; VI-NEXT: ; implicit-def: $vgpr1
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
+; VI-NEXT: s_cbranch_vccz .LBB10_10
+; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: s_and_b32 s0, s4, 0x8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: .LBB10_10: ; %Flow131
+; VI-NEXT: s_xor_b32 s0, s0, 1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cbranch_scc1 .LBB10_16
+; VI-NEXT: ; %bb.11: ; %frem.compute19
+; VI-NEXT: v_frexp_mant_f32_e32 v4, v2
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
+; VI-NEXT: v_ldexp_f32 v2, v4, 1
+; VI-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v3
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v5, v1, 11
+; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT: v_rcp_f32_e32 v9, v4
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT: v_fma_f32 v9, v10, v9, v9
+; VI-NEXT: v_mul_f32_e32 v10, v8, v9
+; VI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT: v_fma_f32 v10, v11, v9, v10
+; VI-NEXT: v_fma_f32 v4, -v4, v10, v8
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
+; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB10_14
+; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT: .LBB10_13: ; %frem.loop_body27
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v5, v6, v4
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v7, v5, v2
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT: v_ldexp_f32 v5, v5, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3
+; VI-NEXT: s_cbranch_vccnz .LBB10_13
+; VI-NEXT: s_branch .LBB10_15
+; VI-NEXT: .LBB10_14:
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3
+; VI-NEXT: v_ldexp_f32 v3, v6, v3
+; VI-NEXT: v_mul_f32_e32 v4, v3, v4
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: v_ldexp_f32 v1, v2, v1
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_cvt_f32_f16_e32 v4, s5
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6
-; VI-NEXT: v_trunc_f16_e32 v1, v1
-; VI-NEXT: v_fma_f16 v1, -v1, v2, s6
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT: v_rcp_f32_e32 v5, v4
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_mul_f32_e32 v6, v2, v5
-; VI-NEXT: v_mad_f32 v7, -v4, v6, v2
-; VI-NEXT: v_mac_f32_e32 v6, v7, v5
-; VI-NEXT: v_mad_f32 v2, -v4, v6, v2
-; VI-NEXT: v_mul_f32_e32 v2, v2, v5
-; VI-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; VI-NEXT: v_add_f32_e32 v2, v2, v6
+; VI-NEXT: s_and_b32 s0, s4, 0x8000
+; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; VI-NEXT: v_or_b32_e32 v1, s0, v1
+; VI-NEXT: .LBB10_16: ; %Flow132
+; VI-NEXT: v_cvt_f32_f16_e64 v4, |s9|
+; VI-NEXT: v_cvt_f32_f16_e64 v3, |s7|
+; VI-NEXT: s_mov_b32 s0, 1
+; VI-NEXT: ; implicit-def: $vgpr2
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
+; VI-NEXT: s_cbranch_vccz .LBB10_18
+; VI-NEXT: ; %bb.17: ; %frem.else53
+; VI-NEXT: s_and_b32 s0, s9, 0x8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: .LBB10_18: ; %Flow127
+; VI-NEXT: s_xor_b32 s0, s0, 1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cbranch_scc1 .LBB10_24
+; VI-NEXT: ; %bb.19: ; %frem.compute52
+; VI-NEXT: v_frexp_mant_f32_e32 v5, v3
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3
+; VI-NEXT: v_ldexp_f32 v3, v5, 1
+; VI-NEXT: v_div_scale_f32 v5, s[0:1], v3, v3, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v2, v4
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4
+; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v7
+; VI-NEXT: v_ldexp_f32 v6, v2, 11
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v8
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v2
+; VI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; VI-NEXT: v_rcp_f32_e32 v10, v5
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v11, -v5, v10, 1.0
+; VI-NEXT: v_fma_f32 v10, v11, v10, v10
+; VI-NEXT: v_mul_f32_e32 v11, v9, v10
+; VI-NEXT: v_fma_f32 v12, -v5, v11, v9
+; VI-NEXT: v_fma_f32 v11, v12, v10, v11
+; VI-NEXT: v_fma_f32 v5, -v5, v11, v9
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v5, v5, v10, v11
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4
+; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB10_22
+; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v7
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8
+; VI-NEXT: .LBB10_21: ; %frem.loop_body60
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: v_mul_f32_e32 v6, v7, v5
+; VI-NEXT: v_rndne_f32_e32 v6, v6
+; VI-NEXT: v_fma_f32 v6, -v6, v3, v7
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6
+; VI-NEXT: v_add_f32_e32 v8, v6, v3
+; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4
+; VI-NEXT: v_ldexp_f32 v6, v6, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4
+; VI-NEXT: s_cbranch_vccnz .LBB10_21
+; VI-NEXT: s_branch .LBB10_23
+; VI-NEXT: .LBB10_22:
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4
+; VI-NEXT: v_ldexp_f32 v4, v7, v4
+; VI-NEXT: v_mul_f32_e32 v5, v4, v5
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v4, -v5, v3, v4
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT: v_ldexp_f32 v2, v3, v2
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
-; VI-NEXT: v_mov_b32_e32 v4, s9
-; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3
-; VI-NEXT: v_trunc_f16_e32 v2, v2
-; VI-NEXT: v_fma_f16 v2, -v2, v3, s3
-; VI-NEXT: v_cvt_f32_f16_e32 v3, s7
-; VI-NEXT: v_rcp_f32_e32 v6, v5
-; VI-NEXT: v_mul_f32_e32 v7, v3, v6
-; VI-NEXT: v_mad_f32 v8, -v5, v7, v3
-; VI-NEXT: v_mac_f32_e32 v7, v8, v6
-; VI-NEXT: v_mad_f32 v3, -v5, v7, v3
-; VI-NEXT: v_mul_f32_e32 v3, v3, v6
-; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; VI-NEXT: v_add_f32_e32 v3, v3, v7
+; VI-NEXT: s_and_b32 s0, s9, 0x8000
+; VI-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; VI-NEXT: v_or_b32_e32 v2, s0, v2
+; VI-NEXT: .LBB10_24: ; %Flow128
+; VI-NEXT: s_lshr_b32 s12, s9, 16
+; VI-NEXT: s_lshr_b32 s10, s7, 16
+; VI-NEXT: v_cvt_f32_f16_e64 v5, |s12|
+; VI-NEXT: v_cvt_f32_f16_e64 v4, |s10|
+; VI-NEXT: s_mov_b32 s0, 1
+; VI-NEXT: ; implicit-def: $vgpr3
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4
+; VI-NEXT: s_cbranch_vccz .LBB10_26
+; VI-NEXT: ; %bb.25: ; %frem.else86
+; VI-NEXT: s_and_b32 s0, s12, 0x8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v6, s12
+; VI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: .LBB10_26: ; %Flow123
+; VI-NEXT: s_xor_b32 s0, s0, 1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cbranch_scc1 .LBB10_32
+; VI-NEXT: ; %bb.27: ; %frem.compute85
+; VI-NEXT: v_frexp_mant_f32_e32 v6, v4
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4
+; VI-NEXT: v_ldexp_f32 v4, v6, 1
+; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v3, v5
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v5
+; VI-NEXT: v_add_u32_e32 v5, vcc, -1, v8
+; VI-NEXT: v_ldexp_f32 v7, v3, 11
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v9
+; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; VI-NEXT: v_rcp_f32_e32 v11, v6
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v12, -v6, v11, 1.0
+; VI-NEXT: v_fma_f32 v11, v12, v11, v11
+; VI-NEXT: v_mul_f32_e32 v12, v10, v11
+; VI-NEXT: v_fma_f32 v13, -v6, v12, v10
+; VI-NEXT: v_fma_f32 v12, v13, v11, v12
+; VI-NEXT: v_fma_f32 v6, -v6, v12, v10
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v6, v6, v11, v12
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5
+; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB10_30
+; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; VI-NEXT: v_add_u32_e32 v5, vcc, 11, v8
+; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9
+; VI-NEXT: .LBB10_29: ; %frem.loop_body93
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mul_f32_e32 v7, v8, v6
+; VI-NEXT: v_rndne_f32_e32 v7, v7
+; VI-NEXT: v_fma_f32 v7, -v7, v4, v8
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7
+; VI-NEXT: v_add_f32_e32 v9, v7, v4
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5
+; VI-NEXT: v_ldexp_f32 v7, v7, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v5
+; VI-NEXT: s_cbranch_vccnz .LBB10_29
+; VI-NEXT: s_branch .LBB10_31
+; VI-NEXT: .LBB10_30:
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; VI-NEXT: v_add_u32_e32 v5, vcc, -10, v5
+; VI-NEXT: v_ldexp_f32 v5, v8, v5
+; VI-NEXT: v_mul_f32_e32 v6, v5, v6
+; VI-NEXT: v_rndne_f32_e32 v6, v6
+; VI-NEXT: v_fma_f32 v5, -v6, v4, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v4, v5, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT: v_ldexp_f32 v3, v4, v3
; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7
-; VI-NEXT: v_trunc_f16_e32 v3, v3
-; VI-NEXT: v_fma_f16 v3, -v3, v4, s7
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; VI-NEXT: v_or_b32_e32 v1, v2, v1
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_and_b32 s0, s12, 0x8000
+; VI-NEXT: v_and_b32_e32 v3, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v3, s0, v3
+; VI-NEXT: .LBB10_32: ; %Flow124
+; VI-NEXT: v_mov_b32_e32 v4, 0x7c00
+; VI-NEXT: v_cmp_nlg_f16_e64 vcc, s6, 0
+; VI-NEXT: v_cmp_nge_f16_e64 s[0:1], |s8|, v4
+; VI-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, 0
+; VI-NEXT: v_cmp_nge_f16_e64 s[4:5], |s4|, v4
+; VI-NEXT: v_cmp_nge_f16_e64 s[8:9], |s9|, v4
+; VI-NEXT: v_cmp_nge_f16_e64 s[12:13], |s12|, v4
+; VI-NEXT: v_mov_b32_e32 v4, 0x7e00
+; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_mov_b64 vcc, s[2:3]
+; VI-NEXT: v_cmp_nlg_f16_e64 s[6:7], s7, 0
+; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_mov_b64 vcc, s[4:5]
+; VI-NEXT: v_cmp_nlg_f16_e64 s[10:11], s10, 0
+; VI-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_mov_b64 vcc, s[6:7]
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_mov_b64 vcc, s[10:11]
+; VI-NEXT: v_cndmask_b32_sdwa v2, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_mov_b64 vcc, s[12:13]
+; VI-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[8:9]
+; VI-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_mov_b32_e32 v2, s16
+; VI-NEXT: v_mov_b32_e32 v3, s17
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
@@ -791,43 +2138,171 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: s_mov_b32 s6, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s4
-; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2
-; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
-; CI-NEXT: v_rcp_f32_e32 v3, v1
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT: ; implicit-def: $vgpr0
+; CI-NEXT: s_cbranch_vccz .LBB11_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s6, s2, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v1, s4
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; CI-NEXT: v_mov_b32_e32 v1, s6
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: .LBB11_2: ; %Flow53
+; CI-NEXT: s_xor_b32 s6, s6, 1
+; CI-NEXT: s_cmp_lg_u32 s6, 0
+; CI-NEXT: s_cbranch_scc1 .LBB11_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s4|
+; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s4|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; CI-NEXT: v_fma_f32 v3, v4, v3, v3
-; CI-NEXT: v_mul_f32_e32 v4, v2, v3
-; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; CI-NEXT: v_fma_f32 v4, v5, v3, v4
-; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
-; CI-NEXT: v_trunc_f32_e32 v1, v1
-; CI-NEXT: v_fma_f32 v0, -v1, v0, s2
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB11_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB11_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; CI-NEXT: s_cbranch_vccnz .LBB11_5
+; CI-NEXT: s_branch .LBB11_7
+; CI-NEXT: .LBB11_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB11_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; CI-NEXT: s_and_b32 s6, s2, 0x80000000
+; CI-NEXT: v_or_b32_e32 v0, s6, v0
+; CI-NEXT: .LBB11_8: ; %Flow54
; CI-NEXT: v_mov_b32_e32 v1, s5
-; CI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, s3
-; CI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3
-; CI-NEXT: v_rcp_f32_e32 v4, v2
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s3|, |v1|
+; CI-NEXT: s_mov_b32 s6, 1
+; CI-NEXT: ; implicit-def: $vgpr1
+; CI-NEXT: s_cbranch_vccz .LBB11_10
+; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: s_and_b32 s6, s3, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v2, s5
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
+; CI-NEXT: v_mov_b32_e32 v2, s6
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: .LBB11_10: ; %Flow49
+; CI-NEXT: s_xor_b32 s6, s6, 1
+; CI-NEXT: s_cmp_lg_u32 s6, 0
+; CI-NEXT: s_cbranch_scc1 .LBB11_16
+; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: v_frexp_mant_f32_e64 v2, |s5|
+; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1
+; CI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s5|
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT: v_ldexp_f32_e64 v5, v1, 12
+; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT: v_rcp_f32_e32 v9, v4
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT: v_fma_f32 v4, v5, v4, v4
-; CI-NEXT: v_mul_f32_e32 v5, v3, v4
-; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT: v_fma_f32 v5, v6, v4, v5
-; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT: v_fma_f32 v9, v10, v9, v9
+; CI-NEXT: v_mul_f32_e32 v10, v8, v9
+; CI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT: v_fma_f32 v10, v11, v9, v10
+; CI-NEXT: v_fma_f32 v4, -v4, v10, v8
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
+; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB11_14
+; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT: .LBB11_13: ; %frem.loop_body23
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: v_mul_f32_e32 v5, v6, v4
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v7, v5, v2
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT: v_add_i32_e32 v3, vcc, -12, v3
+; CI-NEXT: v_ldexp_f32_e64 v5, v5, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3
+; CI-NEXT: s_cbranch_vccnz .LBB11_13
+; CI-NEXT: s_branch .LBB11_15
+; CI-NEXT: .LBB11_14:
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT: v_mul_f32_e32 v4, v3, v4
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT: v_add_f32_e32 v2, v3, v2
+; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1
+; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT: s_and_b32 s6, s3, 0x80000000
+; CI-NEXT: v_or_b32_e32 v1, s6, v1
+; CI-NEXT: .LBB11_16: ; %Flow50
+; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
+; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v3, 0x7f800000
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v3
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s5, 0
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s3|, v3
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v1, -v2, v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -836,42 +2311,170 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_mov_b32 s6, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2
-; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
-; VI-NEXT: v_rcp_f32_e32 v3, v1
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: s_cbranch_vccz .LBB11_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s6, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: .LBB11_2: ; %Flow53
+; VI-NEXT: s_xor_b32 s6, s6, 1
+; VI-NEXT: s_cmp_lg_u32 s6, 0
+; VI-NEXT: s_cbranch_scc1 .LBB11_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s4|
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s4|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 12
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; VI-NEXT: v_fma_f32 v3, v4, v3, v3
-; VI-NEXT: v_mul_f32_e32 v4, v2, v3
-; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; VI-NEXT: v_fma_f32 v4, v5, v3, v4
-; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
-; VI-NEXT: v_trunc_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v0, -v1, v0, s2
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB11_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB11_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; VI-NEXT: s_cbranch_vccnz .LBB11_5
+; VI-NEXT: s_branch .LBB11_7
+; VI-NEXT: .LBB11_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB11_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; VI-NEXT: s_and_b32 s6, s2, 0x80000000
+; VI-NEXT: v_or_b32_e32 v0, s6, v0
+; VI-NEXT: .LBB11_8: ; %Flow54
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, s3
-; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3
-; VI-NEXT: v_rcp_f32_e32 v4, v2
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s3|, |v1|
+; VI-NEXT: s_mov_b32 s6, 1
+; VI-NEXT: ; implicit-def: $vgpr1
+; VI-NEXT: s_cbranch_vccz .LBB11_10
+; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: s_and_b32 s6, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: .LBB11_10: ; %Flow49
+; VI-NEXT: s_xor_b32 s6, s6, 1
+; VI-NEXT: s_cmp_lg_u32 s6, 0
+; VI-NEXT: s_cbranch_scc1 .LBB11_16
+; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: v_frexp_mant_f32_e64 v2, |s5|
+; VI-NEXT: v_ldexp_f32 v2, v2, 1
+; VI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s5|
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v5, v1, 12
+; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT: v_rcp_f32_e32 v9, v4
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; VI-NEXT: v_fma_f32 v4, v5, v4, v4
-; VI-NEXT: v_mul_f32_e32 v5, v3, v4
-; VI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; VI-NEXT: v_fma_f32 v5, v6, v4, v5
-; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT: v_fma_f32 v9, v10, v9, v9
+; VI-NEXT: v_mul_f32_e32 v10, v8, v9
+; VI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT: v_fma_f32 v10, v11, v9, v10
+; VI-NEXT: v_fma_f32 v4, -v4, v10, v8
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
-; VI-NEXT: v_trunc_f32_e32 v2, v2
-; VI-NEXT: v_fma_f32 v1, -v2, v1, s3
+; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
+; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB11_14
+; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT: .LBB11_13: ; %frem.loop_body23
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v5, v6, v4
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v7, v5, v2
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, -12, v3
+; VI-NEXT: v_ldexp_f32 v5, v5, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3
+; VI-NEXT: s_cbranch_vccnz .LBB11_13
+; VI-NEXT: s_branch .LBB11_15
+; VI-NEXT: .LBB11_14:
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT: v_ldexp_f32 v3, v6, v3
+; VI-NEXT: v_mul_f32_e32 v4, v3, v4
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: v_ldexp_f32 v1, v2, v1
+; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; VI-NEXT: s_and_b32 s6, s3, 0x80000000
+; VI-NEXT: v_or_b32_e32 v1, s6, v1
+; VI-NEXT: .LBB11_16: ; %Flow50
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
+; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; VI-NEXT: v_mov_b32_e32 v3, 0x7f800000
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s2|, v3
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s5, 0
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s3|, v3
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -892,73 +2495,327 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
+; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s8
-; CI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s4
-; CI-NEXT: v_div_scale_f32 v2, vcc, s4, v0, s4
-; CI-NEXT: v_rcp_f32_e32 v3, v1
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
+; CI-NEXT: ; implicit-def: $vgpr0
+; CI-NEXT: s_cbranch_vccz .LBB12_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s2, s4, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v1, s8
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
+; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB12_2: ; %Flow127
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB12_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s8|
+; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v0, |s4|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s4|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s8|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; CI-NEXT: v_fma_f32 v3, v4, v3, v3
-; CI-NEXT: v_mul_f32_e32 v4, v2, v3
-; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; CI-NEXT: v_fma_f32 v4, v5, v3, v4
-; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s4
-; CI-NEXT: v_trunc_f32_e32 v1, v1
-; CI-NEXT: v_fma_f32 v0, -v1, v0, s4
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB12_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB12_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; CI-NEXT: s_cbranch_vccnz .LBB12_5
+; CI-NEXT: s_branch .LBB12_7
+; CI-NEXT: .LBB12_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB12_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; CI-NEXT: s_and_b32 s2, s4, 0x80000000
+; CI-NEXT: v_or_b32_e32 v0, s2, v0
+; CI-NEXT: .LBB12_8: ; %Flow128
; CI-NEXT: v_mov_b32_e32 v1, s9
-; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s5
-; CI-NEXT: v_div_scale_f32 v3, vcc, s5, v1, s5
-; CI-NEXT: v_rcp_f32_e32 v4, v2
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s5|, |v1|
+; CI-NEXT: s_mov_b32 s2, 1
+; CI-NEXT: ; implicit-def: $vgpr1
+; CI-NEXT: s_cbranch_vccz .LBB12_10
+; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: s_and_b32 s2, s5, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v2, s9
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB12_10: ; %Flow123
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB12_16
+; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: v_frexp_mant_f32_e64 v2, |s9|
+; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1
+; CI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s5|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s5|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s9|
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT: v_ldexp_f32_e64 v5, v1, 12
+; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT: v_rcp_f32_e32 v9, v4
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT: v_fma_f32 v4, v5, v4, v4
-; CI-NEXT: v_mul_f32_e32 v5, v3, v4
-; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT: v_fma_f32 v5, v6, v4, v5
-; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT: v_fma_f32 v9, v10, v9, v9
+; CI-NEXT: v_mul_f32_e32 v10, v8, v9
+; CI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT: v_fma_f32 v10, v11, v9, v10
+; CI-NEXT: v_fma_f32 v4, -v4, v10, v8
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s5
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v1, -v2, v1, s5
+; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
+; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB12_14
+; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT: .LBB12_13: ; %frem.loop_body23
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: v_mul_f32_e32 v5, v6, v4
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v7, v5, v2
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT: v_add_i32_e32 v3, vcc, -12, v3
+; CI-NEXT: v_ldexp_f32_e64 v5, v5, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3
+; CI-NEXT: s_cbranch_vccnz .LBB12_13
+; CI-NEXT: s_branch .LBB12_15
+; CI-NEXT: .LBB12_14:
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT: v_mul_f32_e32 v4, v3, v4
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT: v_add_f32_e32 v2, v3, v2
+; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1
+; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT: s_and_b32 s2, s5, 0x80000000
+; CI-NEXT: v_or_b32_e32 v1, s2, v1
+; CI-NEXT: .LBB12_16: ; %Flow124
; CI-NEXT: v_mov_b32_e32 v2, s10
-; CI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6
-; CI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6
-; CI-NEXT: v_rcp_f32_e32 v5, v3
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s6|, |v2|
+; CI-NEXT: s_mov_b32 s2, 1
+; CI-NEXT: ; implicit-def: $vgpr2
+; CI-NEXT: s_cbranch_vccz .LBB12_18
+; CI-NEXT: ; %bb.17: ; %frem.else47
+; CI-NEXT: s_and_b32 s2, s6, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v3, s10
+; CI-NEXT: v_mov_b32_e32 v2, s6
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
+; CI-NEXT: v_mov_b32_e32 v3, s2
+; CI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB12_18: ; %Flow119
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB12_24
+; CI-NEXT: ; %bb.19: ; %frem.compute46
+; CI-NEXT: v_frexp_mant_f32_e64 v3, |s10|
+; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1
+; CI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v2, |s6|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s6|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s10|
+; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v7
+; CI-NEXT: v_ldexp_f32_e64 v6, v2, 12
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8
+; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2
+; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; CI-NEXT: v_rcp_f32_e32 v10, v5
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
-; CI-NEXT: v_fma_f32 v5, v6, v5, v5
-; CI-NEXT: v_mul_f32_e32 v6, v4, v5
-; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
-; CI-NEXT: v_fma_f32 v6, v7, v5, v6
-; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
+; CI-NEXT: v_fma_f32 v11, -v5, v10, 1.0
+; CI-NEXT: v_fma_f32 v10, v11, v10, v10
+; CI-NEXT: v_mul_f32_e32 v11, v9, v10
+; CI-NEXT: v_fma_f32 v12, -v5, v11, v9
+; CI-NEXT: v_fma_f32 v11, v12, v10, v11
+; CI-NEXT: v_fma_f32 v5, -v5, v11, v9
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
-; CI-NEXT: v_div_fixup_f32 v3, v3, v2, s6
-; CI-NEXT: v_trunc_f32_e32 v3, v3
-; CI-NEXT: v_fma_f32 v2, -v3, v2, s6
+; CI-NEXT: v_div_fmas_f32 v5, v5, v10, v11
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4
+; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB12_22
+; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; CI-NEXT: v_add_i32_e32 v4, vcc, 12, v7
+; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
+; CI-NEXT: .LBB12_21: ; %frem.loop_body54
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v6
+; CI-NEXT: v_mul_f32_e32 v6, v7, v5
+; CI-NEXT: v_rndne_f32_e32 v6, v6
+; CI-NEXT: v_fma_f32 v6, -v6, v3, v7
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6
+; CI-NEXT: v_add_f32_e32 v8, v6, v3
+; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; CI-NEXT: v_add_i32_e32 v4, vcc, -12, v4
+; CI-NEXT: v_ldexp_f32_e64 v6, v6, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4
+; CI-NEXT: s_cbranch_vccnz .LBB12_21
+; CI-NEXT: s_branch .LBB12_23
+; CI-NEXT: .LBB12_22:
+; CI-NEXT: v_mov_b32_e32 v7, v6
+; CI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4
+; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4
+; CI-NEXT: v_mul_f32_e32 v5, v4, v5
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v4, -v5, v3, v4
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v3, v4, v3
+; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2
+; CI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; CI-NEXT: s_and_b32 s2, s6, 0x80000000
+; CI-NEXT: v_or_b32_e32 v2, s2, v2
+; CI-NEXT: .LBB12_24: ; %Flow120
; CI-NEXT: v_mov_b32_e32 v3, s11
-; CI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7
-; CI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7
-; CI-NEXT: v_rcp_f32_e32 v6, v4
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s7|, |v3|
+; CI-NEXT: s_mov_b32 s2, 1
+; CI-NEXT: ; implicit-def: $vgpr3
+; CI-NEXT: s_cbranch_vccz .LBB12_26
+; CI-NEXT: ; %bb.25: ; %frem.else78
+; CI-NEXT: s_and_b32 s2, s7, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v4, s11
+; CI-NEXT: v_mov_b32_e32 v3, s7
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
+; CI-NEXT: v_mov_b32_e32 v4, s2
+; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB12_26: ; %Flow115
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB12_32
+; CI-NEXT: ; %bb.27: ; %frem.compute77
+; CI-NEXT: v_frexp_mant_f32_e64 v4, |s11|
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 1
+; CI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v3, |s7|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s7|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v9, |s11|
+; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v8
+; CI-NEXT: v_ldexp_f32_e64 v7, v3, 12
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v9
+; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v3
+; CI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; CI-NEXT: v_rcp_f32_e32 v11, v6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
-; CI-NEXT: v_fma_f32 v6, v7, v6, v6
-; CI-NEXT: v_mul_f32_e32 v7, v5, v6
-; CI-NEXT: v_fma_f32 v8, -v4, v7, v5
-; CI-NEXT: v_fma_f32 v7, v8, v6, v7
-; CI-NEXT: v_fma_f32 v4, -v4, v7, v5
+; CI-NEXT: v_fma_f32 v12, -v6, v11, 1.0
+; CI-NEXT: v_fma_f32 v11, v12, v11, v11
+; CI-NEXT: v_mul_f32_e32 v12, v10, v11
+; CI-NEXT: v_fma_f32 v13, -v6, v12, v10
+; CI-NEXT: v_fma_f32 v12, v13, v11, v12
+; CI-NEXT: v_fma_f32 v6, -v6, v12, v10
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
+; CI-NEXT: v_div_fmas_f32 v6, v6, v11, v12
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5
+; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB12_30
+; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v8
+; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
+; CI-NEXT: .LBB12_29: ; %frem.loop_body85
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v8, v7
+; CI-NEXT: v_mul_f32_e32 v7, v8, v6
+; CI-NEXT: v_rndne_f32_e32 v7, v7
+; CI-NEXT: v_fma_f32 v7, -v7, v4, v8
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7
+; CI-NEXT: v_add_f32_e32 v9, v7, v4
+; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; CI-NEXT: v_add_i32_e32 v5, vcc, -12, v5
+; CI-NEXT: v_ldexp_f32_e64 v7, v7, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v5
+; CI-NEXT: s_cbranch_vccnz .LBB12_29
+; CI-NEXT: s_branch .LBB12_31
+; CI-NEXT: .LBB12_30:
+; CI-NEXT: v_mov_b32_e32 v8, v7
+; CI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5
+; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5
+; CI-NEXT: v_mul_f32_e32 v6, v5, v6
+; CI-NEXT: v_rndne_f32_e32 v6, v6
+; CI-NEXT: v_fma_f32 v5, -v6, v4, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v4, v5, v4
+; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; CI-NEXT: v_ldexp_f32_e32 v3, v4, v3
+; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; CI-NEXT: s_and_b32 s2, s7, 0x80000000
+; CI-NEXT: v_or_b32_e32 v3, s2, v3
+; CI-NEXT: .LBB12_32: ; %Flow116
+; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s8, 0
+; CI-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v5, 0x7f800000
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s4|, v5
+; CI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s9, 0
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s5|, v5
+; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s10, 0
+; CI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s6|, v5
+; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT: v_cmp_nlg_f32_e64 vcc, s11, 0
+; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; CI-NEXT: v_cmp_nge_f32_e64 vcc, |s7|, v5
+; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_div_fixup_f32 v4, v4, v3, s7
-; CI-NEXT: v_trunc_f32_e32 v4, v4
-; CI-NEXT: v_fma_f32 v3, -v4, v3, s7
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
@@ -969,71 +2826,325 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40
+; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s4
-; VI-NEXT: v_div_scale_f32 v2, vcc, s4, v0, s4
-; VI-NEXT: v_rcp_f32_e32 v3, v1
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
+; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: s_cbranch_vccz .LBB12_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s2, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v1, s8
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB12_2: ; %Flow127
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB12_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s8|
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v0, |s4|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s4|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s8|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 12
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; VI-NEXT: v_fma_f32 v3, v4, v3, v3
-; VI-NEXT: v_mul_f32_e32 v4, v2, v3
-; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; VI-NEXT: v_fma_f32 v4, v5, v3, v4
-; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s4
-; VI-NEXT: v_trunc_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v0, -v1, v0, s4
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB12_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB12_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; VI-NEXT: s_cbranch_vccnz .LBB12_5
+; VI-NEXT: s_branch .LBB12_7
+; VI-NEXT: .LBB12_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB12_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; VI-NEXT: s_and_b32 s2, s4, 0x80000000
+; VI-NEXT: v_or_b32_e32 v0, s2, v0
+; VI-NEXT: .LBB12_8: ; %Flow128
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s5
-; VI-NEXT: v_div_scale_f32 v3, vcc, s5, v1, s5
-; VI-NEXT: v_rcp_f32_e32 v4, v2
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s5|, |v1|
+; VI-NEXT: s_mov_b32 s2, 1
+; VI-NEXT: ; implicit-def: $vgpr1
+; VI-NEXT: s_cbranch_vccz .LBB12_10
+; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: s_and_b32 s2, s5, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s9
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB12_10: ; %Flow123
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB12_16
+; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: v_frexp_mant_f32_e64 v2, |s9|
+; VI-NEXT: v_ldexp_f32 v2, v2, 1
+; VI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s5|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s5|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s9|
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v5, v1, 12
+; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT: v_rcp_f32_e32 v9, v4
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; VI-NEXT: v_fma_f32 v4, v5, v4, v4
-; VI-NEXT: v_mul_f32_e32 v5, v3, v4
-; VI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; VI-NEXT: v_fma_f32 v5, v6, v4, v5
-; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT: v_fma_f32 v9, v10, v9, v9
+; VI-NEXT: v_mul_f32_e32 v10, v8, v9
+; VI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT: v_fma_f32 v10, v11, v9, v10
+; VI-NEXT: v_fma_f32 v4, -v4, v10, v8
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s5
-; VI-NEXT: v_trunc_f32_e32 v2, v2
-; VI-NEXT: v_fma_f32 v1, -v2, v1, s5
+; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
+; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB12_14
+; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT: .LBB12_13: ; %frem.loop_body23
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v5, v6, v4
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v7, v5, v2
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, -12, v3
+; VI-NEXT: v_ldexp_f32 v5, v5, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3
+; VI-NEXT: s_cbranch_vccnz .LBB12_13
+; VI-NEXT: s_branch .LBB12_15
+; VI-NEXT: .LBB12_14:
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT: v_ldexp_f32 v3, v6, v3
+; VI-NEXT: v_mul_f32_e32 v4, v3, v4
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: v_ldexp_f32 v1, v2, v1
+; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; VI-NEXT: s_and_b32 s2, s5, 0x80000000
+; VI-NEXT: v_or_b32_e32 v1, s2, v1
+; VI-NEXT: .LBB12_16: ; %Flow124
; VI-NEXT: v_mov_b32_e32 v2, s10
-; VI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6
-; VI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6
-; VI-NEXT: v_rcp_f32_e32 v5, v3
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s6|, |v2|
+; VI-NEXT: s_mov_b32 s2, 1
+; VI-NEXT: ; implicit-def: $vgpr2
+; VI-NEXT: s_cbranch_vccz .LBB12_18
+; VI-NEXT: ; %bb.17: ; %frem.else47
+; VI-NEXT: s_and_b32 s2, s6, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v3, s10
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB12_18: ; %Flow119
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB12_24
+; VI-NEXT: ; %bb.19: ; %frem.compute46
+; VI-NEXT: v_frexp_mant_f32_e64 v3, |s10|
+; VI-NEXT: v_ldexp_f32 v3, v3, 1
+; VI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v2, |s6|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s6|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s10|
+; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v7
+; VI-NEXT: v_ldexp_f32 v6, v2, 12
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v8
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v2
+; VI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; VI-NEXT: v_rcp_f32_e32 v10, v5
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
-; VI-NEXT: v_fma_f32 v5, v6, v5, v5
-; VI-NEXT: v_mul_f32_e32 v6, v4, v5
-; VI-NEXT: v_fma_f32 v7, -v3, v6, v4
-; VI-NEXT: v_fma_f32 v6, v7, v5, v6
-; VI-NEXT: v_fma_f32 v3, -v3, v6, v4
+; VI-NEXT: v_fma_f32 v11, -v5, v10, 1.0
+; VI-NEXT: v_fma_f32 v10, v11, v10, v10
+; VI-NEXT: v_mul_f32_e32 v11, v9, v10
+; VI-NEXT: v_fma_f32 v12, -v5, v11, v9
+; VI-NEXT: v_fma_f32 v11, v12, v10, v11
+; VI-NEXT: v_fma_f32 v5, -v5, v11, v9
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
-; VI-NEXT: v_div_fixup_f32 v3, v3, v2, s6
-; VI-NEXT: v_trunc_f32_e32 v3, v3
-; VI-NEXT: v_fma_f32 v2, -v3, v2, s6
+; VI-NEXT: v_div_fmas_f32 v5, v5, v10, v11
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4
+; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB12_22
+; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; VI-NEXT: v_add_u32_e32 v4, vcc, 12, v7
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8
+; VI-NEXT: .LBB12_21: ; %frem.loop_body54
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: v_mul_f32_e32 v6, v7, v5
+; VI-NEXT: v_rndne_f32_e32 v6, v6
+; VI-NEXT: v_fma_f32 v6, -v6, v3, v7
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6
+; VI-NEXT: v_add_f32_e32 v8, v6, v3
+; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, -12, v4
+; VI-NEXT: v_ldexp_f32 v6, v6, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4
+; VI-NEXT: s_cbranch_vccnz .LBB12_21
+; VI-NEXT: s_branch .LBB12_23
+; VI-NEXT: .LBB12_22:
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4
+; VI-NEXT: v_ldexp_f32 v4, v7, v4
+; VI-NEXT: v_mul_f32_e32 v5, v4, v5
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v4, -v5, v3, v4
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT: v_ldexp_f32 v2, v3, v2
+; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
+; VI-NEXT: s_and_b32 s2, s6, 0x80000000
+; VI-NEXT: v_or_b32_e32 v2, s2, v2
+; VI-NEXT: .LBB12_24: ; %Flow120
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7
-; VI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7
-; VI-NEXT: v_rcp_f32_e32 v6, v4
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s7|, |v3|
+; VI-NEXT: s_mov_b32 s2, 1
+; VI-NEXT: ; implicit-def: $vgpr3
+; VI-NEXT: s_cbranch_vccz .LBB12_26
+; VI-NEXT: ; %bb.25: ; %frem.else78
+; VI-NEXT: s_and_b32 s2, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v4, s11
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB12_26: ; %Flow115
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB12_32
+; VI-NEXT: ; %bb.27: ; %frem.compute77
+; VI-NEXT: v_frexp_mant_f32_e64 v4, |s11|
+; VI-NEXT: v_ldexp_f32 v4, v4, 1
+; VI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v3, |s7|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s7|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v9, |s11|
+; VI-NEXT: v_add_u32_e32 v5, vcc, -1, v8
+; VI-NEXT: v_ldexp_f32 v7, v3, 12
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v9
+; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; VI-NEXT: v_rcp_f32_e32 v11, v6
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
-; VI-NEXT: v_fma_f32 v6, v7, v6, v6
-; VI-NEXT: v_mul_f32_e32 v7, v5, v6
-; VI-NEXT: v_fma_f32 v8, -v4, v7, v5
-; VI-NEXT: v_fma_f32 v7, v8, v6, v7
-; VI-NEXT: v_fma_f32 v4, -v4, v7, v5
+; VI-NEXT: v_fma_f32 v12, -v6, v11, 1.0
+; VI-NEXT: v_fma_f32 v11, v12, v11, v11
+; VI-NEXT: v_mul_f32_e32 v12, v10, v11
+; VI-NEXT: v_fma_f32 v13, -v6, v12, v10
+; VI-NEXT: v_fma_f32 v12, v13, v11, v12
+; VI-NEXT: v_fma_f32 v6, -v6, v12, v10
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
-; VI-NEXT: v_div_fixup_f32 v4, v4, v3, s7
-; VI-NEXT: v_trunc_f32_e32 v4, v4
-; VI-NEXT: v_fma_f32 v3, -v4, v3, s7
+; VI-NEXT: v_div_fmas_f32 v6, v6, v11, v12
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5
+; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB12_30
+; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; VI-NEXT: v_add_u32_e32 v5, vcc, 12, v8
+; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9
+; VI-NEXT: .LBB12_29: ; %frem.loop_body85
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mul_f32_e32 v7, v8, v6
+; VI-NEXT: v_rndne_f32_e32 v7, v7
+; VI-NEXT: v_fma_f32 v7, -v7, v4, v8
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7
+; VI-NEXT: v_add_f32_e32 v9, v7, v4
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, -12, v5
+; VI-NEXT: v_ldexp_f32 v7, v7, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v5
+; VI-NEXT: s_cbranch_vccnz .LBB12_29
+; VI-NEXT: s_branch .LBB12_31
+; VI-NEXT: .LBB12_30:
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5
+; VI-NEXT: v_ldexp_f32 v5, v8, v5
+; VI-NEXT: v_mul_f32_e32 v6, v5, v6
+; VI-NEXT: v_rndne_f32_e32 v6, v6
+; VI-NEXT: v_fma_f32 v5, -v6, v4, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v4, v5, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT: v_ldexp_f32 v3, v4, v3
+; VI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; VI-NEXT: s_and_b32 s2, s7, 0x80000000
+; VI-NEXT: v_or_b32_e32 v3, s2, v3
+; VI-NEXT: .LBB12_32: ; %Flow116
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s8, 0
+; VI-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; VI-NEXT: v_mov_b32_e32 v5, 0x7f800000
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s4|, v5
+; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s9, 0
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s5|, v5
+; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s10, 0
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s6|, v5
+; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s11, 0
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT: v_cmp_nge_f32_e64 vcc, |s7|, v5
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -1054,39 +3165,198 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
+; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: v_mov_b32_e32 v1, s9
-; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5]
-; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5]
-; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5]
-; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5]
+; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; CI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CI-NEXT: s_cbranch_vccz .LBB13_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: v_mov_b32_e32 v0, s8
+; CI-NEXT: v_mov_b32_e32 v1, s9
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: s_brev_b32 s3, 1
+; CI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v2, s4
+; CI-NEXT: v_mov_b32_e32 v3, s5
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB13_2: ; %Flow53
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB13_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
+; CI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[8:9]|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v6
+; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v2, v8
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0
+; CI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; CI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3]
+; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB13_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7
+; CI-NEXT: .LBB13_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT: v_add_i32_e32 v9, vcc, 0xffffffe6, v9
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; CI-NEXT: s_cbranch_vccnz .LBB13_5
+; CI-NEXT: s_branch .LBB13_7
+; CI-NEXT: .LBB13_6:
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: .LBB13_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9
+; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: s_brev_b32 s3, 1
+; CI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT: v_or_b32_e32 v0, s2, v0
+; CI-NEXT: v_or_b32_e32 v1, s3, v1
+; CI-NEXT: .LBB13_8: ; %Flow54
+; CI-NEXT: v_mov_b32_e32 v2, s10
+; CI-NEXT: v_mov_b32_e32 v3, s11
+; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; CI-NEXT: s_mov_b32 s2, 1
+; CI-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CI-NEXT: s_cbranch_vccz .LBB13_10
+; CI-NEXT: ; %bb.9: ; %frem.else16
; CI-NEXT: v_mov_b32_e32 v2, s10
; CI-NEXT: v_mov_b32_e32 v3, s11
-; CI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7]
-; CI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7]
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: s_brev_b32 s3, 1
+; CI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: v_mov_b32_e32 v4, s6
+; CI-NEXT: v_mov_b32_e32 v5, s7
+; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB13_10: ; %Flow49
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB13_16
+; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
+; CI-NEXT: v_ldexp_f64 v[6:7], v[2:3], 26
+; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[10:11]|
+; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v8
+; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v9
+; CI-NEXT: v_sub_i32_e32 v11, vcc, v4, v10
+; CI-NEXT: v_ldexp_f64 v[2:3], v[2:3], 1
+; CI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0
+; CI-NEXT: v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0
+; CI-NEXT: v_rcp_f64_e32 v[12:13], v[4:5]
+; CI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; CI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; CI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; CI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; CI-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13]
+; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[12:13], v[14:15]
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11
+; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB13_14
+; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: v_add_i32_e32 v8, vcc, 26, v8
+; CI-NEXT: v_sub_i32_e32 v11, vcc, v8, v9
+; CI-NEXT: .LBB13_13: ; %frem.loop_body23
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v9, v7
+; CI-NEXT: v_mov_b32_e32 v8, v6
+; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; CI-NEXT: v_rndne_f64_e32 v[6:7], v[6:7]
+; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[2:3], v[8:9]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; CI-NEXT: v_add_f64 v[12:13], v[6:7], v[2:3]
+; CI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
+; CI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
+; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; CI-NEXT: v_add_i32_e32 v11, vcc, 0xffffffe6, v11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v11
+; CI-NEXT: s_cbranch_vccnz .LBB13_13
+; CI-NEXT: s_branch .LBB13_15
+; CI-NEXT: .LBB13_14:
+; CI-NEXT: v_mov_b32_e32 v9, v7
+; CI-NEXT: v_mov_b32_e32 v8, v6
+; CI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; CI-NEXT: v_add_i32_e32 v6, vcc, 0xffffffe7, v11
+; CI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: s_brev_b32 s3, 1
+; CI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5]
+; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CI-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10
+; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; CI-NEXT: v_or_b32_e32 v2, s2, v2
+; CI-NEXT: v_or_b32_e32 v3, s3, v3
+; CI-NEXT: .LBB13_16: ; %Flow50
+; CI-NEXT: v_cmp_nlg_f64_e64 vcc, s[8:9], 0
+; CI-NEXT: v_mov_b32_e32 v4, 0
+; CI-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; CI-NEXT: v_mov_b32_e32 v5, 0x7ff00000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; CI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7]
-; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
-; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
-; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7]
-; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
-; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7]
+; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; CI-NEXT: v_cmp_nge_f64_e64 vcc, |s[4:5]|, v[4:5]
+; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; CI-NEXT: v_cmp_nlg_f64_e64 vcc, s[10:11], 0
+; CI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; CI-NEXT: v_cmp_nge_f64_e64 vcc, |s[6:7]|, v[4:5]
+; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
@@ -1097,39 +3367,198 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40
+; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5]
-; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5]
-; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5]
-; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5]
+; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT: s_cbranch_vccz .LBB13_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_brev_b32 s3, 1
+; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB13_2: ; %Flow53
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB13_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
+; VI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[8:9]|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v6
+; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v2, v8
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0
+; VI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3]
+; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB13_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7
+; VI-NEXT: .LBB13_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0xffffffe6, v9
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; VI-NEXT: s_cbranch_vccnz .LBB13_5
+; VI-NEXT: s_branch .LBB13_7
+; VI-NEXT: .LBB13_6:
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: .LBB13_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9
+; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_brev_b32 s3, 1
+; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
+; VI-NEXT: v_or_b32_e32 v0, s2, v0
+; VI-NEXT: v_or_b32_e32 v1, s3, v1
+; VI-NEXT: .LBB13_8: ; %Flow54
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; VI-NEXT: s_mov_b32 s2, 1
+; VI-NEXT: ; implicit-def: $vgpr2_vgpr3
+; VI-NEXT: s_cbranch_vccz .LBB13_10
+; VI-NEXT: ; %bb.9: ; %frem.else16
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7]
-; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7]
-; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; VI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7]
-; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
-; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
-; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7]
-; VI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
-; VI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7]
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_brev_b32 s3, 1
+; VI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB13_10: ; %Flow49
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB13_16
+; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
+; VI-NEXT: v_ldexp_f64 v[6:7], v[2:3], 26
+; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[10:11]|
+; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v8
+; VI-NEXT: v_add_u32_e32 v10, vcc, -1, v9
+; VI-NEXT: v_sub_u32_e32 v11, vcc, v4, v10
+; VI-NEXT: v_ldexp_f64 v[2:3], v[2:3], 1
+; VI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0
+; VI-NEXT: v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[12:13], v[4:5]
+; VI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; VI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; VI-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13]
+; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[12:13], v[14:15]
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11
+; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB13_14
+; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: v_add_u32_e32 v8, vcc, 26, v8
+; VI-NEXT: v_sub_u32_e32 v11, vcc, v8, v9
+; VI-NEXT: .LBB13_13: ; %frem.loop_body23
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v8, v6
+; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; VI-NEXT: v_rndne_f64_e32 v[6:7], v[6:7]
+; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[2:3], v[8:9]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; VI-NEXT: v_add_f64 v[12:13], v[6:7], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
+; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0xffffffe6, v11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v11
+; VI-NEXT: s_cbranch_vccnz .LBB13_13
+; VI-NEXT: s_branch .LBB13_15
+; VI-NEXT: .LBB13_14:
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v8, v6
+; VI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; VI-NEXT: v_add_u32_e32 v6, vcc, 0xffffffe7, v11
+; VI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_brev_b32 s3, 1
+; VI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5]
+; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10
+; VI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3
+; VI-NEXT: v_or_b32_e32 v2, s2, v2
+; VI-NEXT: v_or_b32_e32 v3, s3, v3
+; VI-NEXT: .LBB13_16: ; %Flow50
+; VI-NEXT: v_cmp_nlg_f64_e64 vcc, s[8:9], 0
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; VI-NEXT: v_mov_b32_e32 v5, 0x7ff00000
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; VI-NEXT: v_cmp_nge_f64_e64 vcc, |s[4:5]|, v[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; VI-NEXT: v_cmp_nlg_f64_e64 vcc, s[10:11], 0
+; VI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; VI-NEXT: v_cmp_nge_f64_e64 vcc, |s[6:7]|, v[4:5]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fc81e16d68e9..bd5303213a69 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -396,8 +396,7 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v3, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
@@ -784,19 +783,17 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX6-LABEL: v_fshl_v2i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX6-NEXT: v_and_b32_e32 v5, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0
; GFX6-NEXT: v_bfe_u32 v5, v1, 1, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5
+; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 7, v4
-; GFX6-NEXT: v_not_b32_e32 v4, v4
-; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8
-; GFX6-NEXT: v_and_b32_e32 v4, 7, v4
+; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1
@@ -974,100 +971,98 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX8-LABEL: s_fshl_v4i8:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s6, s1, 8
-; GFX8-NEXT: s_lshr_b32 s7, s1, 16
-; GFX8-NEXT: s_lshr_b32 s8, s1, 24
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshr_b32 s9, s2, 8
-; GFX8-NEXT: s_lshr_b32 s10, s2, 16
-; GFX8-NEXT: s_lshr_b32 s11, s2, 24
-; GFX8-NEXT: s_and_b32 s12, s2, 7
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s11, s2, 7
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_lshr_b32 s5, s0, 24
-; GFX8-NEXT: s_lshl_b32 s0, s0, s12
-; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s1, s9, 7
-; GFX8-NEXT: s_and_b32 s2, s6, 0xff
-; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_andn2_b32 s3, 7, s9
-; GFX8-NEXT: s_lshr_b32 s2, s2, s3
-; GFX8-NEXT: s_or_b32 s1, s1, s2
-; GFX8-NEXT: s_and_b32 s2, s10, 7
-; GFX8-NEXT: s_and_b32 s3, s7, 0xff
-; GFX8-NEXT: s_lshl_b32 s2, s4, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s11
+; GFX8-NEXT: s_and_b32 s11, s1, 0xff
+; GFX8-NEXT: s_lshr_b32 s8, s2, 8
+; GFX8-NEXT: s_lshr_b32 s9, s2, 16
+; GFX8-NEXT: s_lshr_b32 s10, s2, 24
+; GFX8-NEXT: s_lshr_b32 s11, s11, 1
+; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_lshr_b32 s2, s11, s2
+; GFX8-NEXT: s_lshr_b32 s6, s1, 8
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s2, s8, 7
+; GFX8-NEXT: s_lshl_b32 s2, s3, s2
+; GFX8-NEXT: s_and_b32 s3, s6, 0xff
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_andn2_b32 s4, 7, s10
-; GFX8-NEXT: s_lshr_b32 s3, s3, s4
+; GFX8-NEXT: s_andn2_b32 s6, 7, s8
+; GFX8-NEXT: s_lshr_b32 s3, s3, s6
+; GFX8-NEXT: s_lshr_b32 s7, s1, 16
; GFX8-NEXT: s_or_b32 s2, s2, s3
-; GFX8-NEXT: s_and_b32 s3, s11, 7
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s3, s5, s3
-; GFX8-NEXT: s_lshr_b32 s4, s8, 1
-; GFX8-NEXT: s_andn2_b32 s5, 7, s11
-; GFX8-NEXT: s_and_b32 s0, s0, 0xff
-; GFX8-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NEXT: s_lshr_b32 s4, s4, s5
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s1, s2, 0xff
+; GFX8-NEXT: s_and_b32 s3, s9, 7
+; GFX8-NEXT: s_lshl_b32 s3, s4, s3
+; GFX8-NEXT: s_and_b32 s4, s7, 0xff
+; GFX8-NEXT: s_lshr_b32 s4, s4, 1
+; GFX8-NEXT: s_andn2_b32 s6, 7, s9
+; GFX8-NEXT: s_lshr_b32 s4, s4, s6
; GFX8-NEXT: s_or_b32 s3, s3, s4
-; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s1, s3, 0xff
+; GFX8-NEXT: s_and_b32 s4, s10, 7
+; GFX8-NEXT: s_lshl_b32 s4, s5, s4
+; GFX8-NEXT: s_lshr_b32 s1, s1, 25
+; GFX8-NEXT: s_andn2_b32 s5, 7, s10
+; GFX8-NEXT: s_and_b32 s2, s2, 0xff
+; GFX8-NEXT: s_lshr_b32 s1, s1, s5
+; GFX8-NEXT: s_and_b32 s0, s0, 0xff
+; GFX8-NEXT: s_lshl_b32 s2, s2, 8
+; GFX8-NEXT: s_or_b32 s1, s4, s1
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s2, s3, 0xff
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_lshl_b32 s1, s1, 24
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshl_v4i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s6, s1, 8
-; GFX9-NEXT: s_lshr_b32 s7, s1, 16
-; GFX9-NEXT: s_lshr_b32 s8, s1, 24
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshr_b32 s9, s2, 8
-; GFX9-NEXT: s_lshr_b32 s10, s2, 16
-; GFX9-NEXT: s_lshr_b32 s11, s2, 24
-; GFX9-NEXT: s_and_b32 s12, s2, 7
-; GFX9-NEXT: s_lshr_b32 s1, s1, 1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s11, s2, 7
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s5, s0, 24
-; GFX9-NEXT: s_lshl_b32 s0, s0, s12
-; GFX9-NEXT: s_lshr_b32 s1, s1, s2
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s1, s9, 7
-; GFX9-NEXT: s_and_b32 s2, s6, 0xff
-; GFX9-NEXT: s_lshl_b32 s1, s3, s1
-; GFX9-NEXT: s_lshr_b32 s2, s2, 1
-; GFX9-NEXT: s_andn2_b32 s3, 7, s9
-; GFX9-NEXT: s_lshr_b32 s2, s2, s3
-; GFX9-NEXT: s_or_b32 s1, s1, s2
-; GFX9-NEXT: s_and_b32 s2, s10, 7
-; GFX9-NEXT: s_and_b32 s3, s7, 0xff
-; GFX9-NEXT: s_lshl_b32 s2, s4, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s11
+; GFX9-NEXT: s_and_b32 s11, s1, 0xff
+; GFX9-NEXT: s_lshr_b32 s8, s2, 8
+; GFX9-NEXT: s_lshr_b32 s9, s2, 16
+; GFX9-NEXT: s_lshr_b32 s10, s2, 24
+; GFX9-NEXT: s_lshr_b32 s11, s11, 1
+; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_lshr_b32 s2, s11, s2
+; GFX9-NEXT: s_lshr_b32 s6, s1, 8
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s2, s8, 7
+; GFX9-NEXT: s_lshl_b32 s2, s3, s2
+; GFX9-NEXT: s_and_b32 s3, s6, 0xff
; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_andn2_b32 s4, 7, s10
-; GFX9-NEXT: s_lshr_b32 s3, s3, s4
+; GFX9-NEXT: s_andn2_b32 s6, 7, s8
+; GFX9-NEXT: s_lshr_b32 s3, s3, s6
+; GFX9-NEXT: s_lshr_b32 s7, s1, 16
; GFX9-NEXT: s_or_b32 s2, s2, s3
-; GFX9-NEXT: s_and_b32 s3, s11, 7
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s3, s5, s3
-; GFX9-NEXT: s_lshr_b32 s4, s8, 1
-; GFX9-NEXT: s_andn2_b32 s5, 7, s11
-; GFX9-NEXT: s_and_b32 s0, s0, 0xff
-; GFX9-NEXT: s_lshl_b32 s1, s1, 8
-; GFX9-NEXT: s_lshr_b32 s4, s4, s5
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s1, s2, 0xff
+; GFX9-NEXT: s_and_b32 s3, s9, 7
+; GFX9-NEXT: s_lshl_b32 s3, s4, s3
+; GFX9-NEXT: s_and_b32 s4, s7, 0xff
+; GFX9-NEXT: s_lshr_b32 s4, s4, 1
+; GFX9-NEXT: s_andn2_b32 s6, 7, s9
+; GFX9-NEXT: s_lshr_b32 s4, s4, s6
; GFX9-NEXT: s_or_b32 s3, s3, s4
-; GFX9-NEXT: s_lshl_b32 s1, s1, 16
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s1, s3, 0xff
+; GFX9-NEXT: s_and_b32 s4, s10, 7
+; GFX9-NEXT: s_lshl_b32 s4, s5, s4
+; GFX9-NEXT: s_lshr_b32 s1, s1, 25
+; GFX9-NEXT: s_andn2_b32 s5, 7, s10
+; GFX9-NEXT: s_and_b32 s2, s2, 0xff
+; GFX9-NEXT: s_lshr_b32 s1, s1, s5
+; GFX9-NEXT: s_and_b32 s0, s0, 0xff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 8
+; GFX9-NEXT: s_or_b32 s1, s4, s1
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s2, s3, 0xff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_or_b32 s0, s0, s2
; GFX9-NEXT: s_lshl_b32 s1, s1, 24
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
@@ -1075,100 +1070,98 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX10-LABEL: s_fshl_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s6, s1, 8
-; GFX10-NEXT: s_lshr_b32 s7, s1, 16
-; GFX10-NEXT: s_lshr_b32 s8, s1, 24
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshr_b32 s9, s2, 8
-; GFX10-NEXT: s_lshr_b32 s10, s2, 16
-; GFX10-NEXT: s_lshr_b32 s11, s2, 24
+; GFX10-NEXT: s_and_b32 s11, s1, 0xff
+; GFX10-NEXT: s_lshr_b32 s8, s2, 8
+; GFX10-NEXT: s_lshr_b32 s9, s2, 16
+; GFX10-NEXT: s_lshr_b32 s10, s2, 24
; GFX10-NEXT: s_and_b32 s12, s2, 7
-; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: s_lshr_b32 s11, s11, 1
; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_and_b32 s6, s6, 0xff
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshr_b32 s1, s1, s2
-; GFX10-NEXT: s_and_b32 s2, s6, 0xff
-; GFX10-NEXT: s_and_b32 s6, s9, 7
-; GFX10-NEXT: s_lshr_b32 s2, s2, 1
-; GFX10-NEXT: s_andn2_b32 s9, 7, s9
+; GFX10-NEXT: s_lshr_b32 s2, s11, s2
+; GFX10-NEXT: s_and_b32 s11, s8, 7
+; GFX10-NEXT: s_lshr_b32 s6, s6, 1
+; GFX10-NEXT: s_andn2_b32 s8, 7, s8
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s0, 24
+; GFX10-NEXT: s_lshr_b32 s7, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, s12
-; GFX10-NEXT: s_lshl_b32 s3, s3, s6
-; GFX10-NEXT: s_lshr_b32 s2, s2, s9
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_or_b32 s1, s3, s2
-; GFX10-NEXT: s_and_b32 s2, s7, 0xff
-; GFX10-NEXT: s_and_b32 s3, s10, 7
-; GFX10-NEXT: s_lshr_b32 s2, s2, 1
-; GFX10-NEXT: s_andn2_b32 s6, 7, s10
-; GFX10-NEXT: s_lshl_b32 s3, s4, s3
-; GFX10-NEXT: s_lshr_b32 s2, s2, s6
-; GFX10-NEXT: s_and_b32 s4, s11, 7
-; GFX10-NEXT: s_lshr_b32 s6, s8, 1
-; GFX10-NEXT: s_andn2_b32 s7, 7, s11
-; GFX10-NEXT: s_lshl_b32 s4, s5, s4
-; GFX10-NEXT: s_lshr_b32 s5, s6, s7
-; GFX10-NEXT: s_or_b32 s2, s3, s2
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_or_b32 s3, s4, s5
-; GFX10-NEXT: s_and_b32 s0, s0, 0xff
-; GFX10-NEXT: s_lshl_b32 s1, s1, 8
+; GFX10-NEXT: s_lshl_b32 s3, s3, s11
+; GFX10-NEXT: s_lshr_b32 s6, s6, s8
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_or_b32 s2, s3, s6
+; GFX10-NEXT: s_and_b32 s3, s7, 0xff
+; GFX10-NEXT: s_and_b32 s6, s9, 7
+; GFX10-NEXT: s_lshr_b32 s3, s3, 1
+; GFX10-NEXT: s_andn2_b32 s7, 7, s9
+; GFX10-NEXT: s_lshl_b32 s4, s4, s6
+; GFX10-NEXT: s_lshr_b32 s3, s3, s7
+; GFX10-NEXT: s_and_b32 s6, s10, 7
+; GFX10-NEXT: s_lshr_b32 s1, s1, 25
+; GFX10-NEXT: s_andn2_b32 s7, 7, s10
+; GFX10-NEXT: s_lshl_b32 s5, s5, s6
+; GFX10-NEXT: s_lshr_b32 s1, s1, s7
+; GFX10-NEXT: s_or_b32 s3, s4, s3
; GFX10-NEXT: s_and_b32 s2, s2, 0xff
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s2, 16
-; GFX10-NEXT: s_and_b32 s2, s3, 0xff
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s2, 24
+; GFX10-NEXT: s_or_b32 s1, s5, s1
+; GFX10-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10-NEXT: s_lshl_b32 s2, s2, 8
+; GFX10-NEXT: s_and_b32 s3, s3, 0xff
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s2, s3, 16
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s1, s1, 24
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_v4i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshr_b32 s6, s1, 8
-; GFX11-NEXT: s_lshr_b32 s7, s1, 16
-; GFX11-NEXT: s_lshr_b32 s8, s1, 24
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshr_b32 s9, s2, 8
-; GFX11-NEXT: s_lshr_b32 s10, s2, 16
-; GFX11-NEXT: s_lshr_b32 s11, s2, 24
+; GFX11-NEXT: s_and_b32 s11, s1, 0xff
+; GFX11-NEXT: s_lshr_b32 s8, s2, 8
+; GFX11-NEXT: s_lshr_b32 s9, s2, 16
+; GFX11-NEXT: s_lshr_b32 s10, s2, 24
; GFX11-NEXT: s_and_b32 s12, s2, 7
-; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: s_lshr_b32 s11, s11, 1
; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_b32 s6, s6, 0xff
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_lshr_b32 s1, s1, s2
-; GFX11-NEXT: s_and_b32 s2, s6, 0xff
-; GFX11-NEXT: s_and_b32 s6, s9, 7
-; GFX11-NEXT: s_lshr_b32 s2, s2, 1
-; GFX11-NEXT: s_and_not1_b32 s9, 7, s9
+; GFX11-NEXT: s_lshr_b32 s2, s11, s2
+; GFX11-NEXT: s_and_b32 s11, s8, 7
+; GFX11-NEXT: s_lshr_b32 s6, s6, 1
+; GFX11-NEXT: s_and_not1_b32 s8, 7, s8
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-NEXT: s_lshr_b32 s5, s0, 24
+; GFX11-NEXT: s_lshr_b32 s7, s1, 16
; GFX11-NEXT: s_lshl_b32 s0, s0, s12
-; GFX11-NEXT: s_lshl_b32 s3, s3, s6
-; GFX11-NEXT: s_lshr_b32 s2, s2, s9
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_and_b32 s2, s7, 0xff
-; GFX11-NEXT: s_and_b32 s3, s10, 7
-; GFX11-NEXT: s_lshr_b32 s2, s2, 1
-; GFX11-NEXT: s_and_not1_b32 s6, 7, s10
-; GFX11-NEXT: s_lshl_b32 s3, s4, s3
-; GFX11-NEXT: s_lshr_b32 s2, s2, s6
-; GFX11-NEXT: s_and_b32 s4, s11, 7
-; GFX11-NEXT: s_lshr_b32 s6, s8, 1
-; GFX11-NEXT: s_and_not1_b32 s7, 7, s11
-; GFX11-NEXT: s_lshl_b32 s4, s5, s4
-; GFX11-NEXT: s_lshr_b32 s5, s6, s7
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_or_b32 s3, s4, s5
-; GFX11-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_lshl_b32 s3, s3, s11
+; GFX11-NEXT: s_lshr_b32 s6, s6, s8
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_or_b32 s2, s3, s6
+; GFX11-NEXT: s_and_b32 s3, s7, 0xff
+; GFX11-NEXT: s_and_b32 s6, s9, 7
+; GFX11-NEXT: s_lshr_b32 s3, s3, 1
+; GFX11-NEXT: s_and_not1_b32 s7, 7, s9
+; GFX11-NEXT: s_lshl_b32 s4, s4, s6
+; GFX11-NEXT: s_lshr_b32 s3, s3, s7
+; GFX11-NEXT: s_and_b32 s6, s10, 7
+; GFX11-NEXT: s_lshr_b32 s1, s1, 25
+; GFX11-NEXT: s_and_not1_b32 s7, 7, s10
+; GFX11-NEXT: s_lshl_b32 s5, s5, s6
+; GFX11-NEXT: s_lshr_b32 s1, s1, s7
+; GFX11-NEXT: s_or_b32 s3, s4, s3
; GFX11-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s2, 16
-; GFX11-NEXT: s_and_b32 s2, s3, 0xff
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s2, 24
+; GFX11-NEXT: s_or_b32 s1, s5, s1
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-NEXT: s_and_b32 s3, s3, 0xff
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_lshl_b32 s2, s3, 16
+; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_lshl_b32 s1, s1, 24
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
@@ -1184,38 +1177,34 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX6-LABEL: v_fshl_v4i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v6, 8, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2
; GFX6-NEXT: v_and_b32_e32 v9, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0
; GFX6-NEXT: v_bfe_u32 v9, v1, 1, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v9
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 7, v6
-; GFX6-NEXT: v_not_b32_e32 v6, v6
; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3
; GFX6-NEXT: v_bfe_u32 v3, v1, 8, 8
-; GFX6-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT: v_bfi_b32 v6, v6, 0, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: v_and_b32_e32 v3, 7, v7
-; GFX6-NEXT: v_not_b32_e32 v6, v7
; GFX6-NEXT: v_lshlrev_b32_e32 v3, v3, v4
; GFX6-NEXT: v_bfe_u32 v4, v1, 16, 8
-; GFX6-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT: v_bfi_b32 v6, v7, 0, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v8
; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
; GFX6-NEXT: v_and_b32_e32 v4, 7, v8
-; GFX6-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX6-NEXT: v_bfi_b32 v6, v8, 0, 7
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 25, v1
; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5
@@ -1255,18 +1244,18 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX8-NEXT: v_mov_b32_e32 v7, 0xff
; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
; GFX8-NEXT: v_mov_b32_e32 v4, 7
-; GFX8-NEXT: v_mov_b32_e32 v9, -1
+; GFX8-NEXT: v_mov_b32_e32 v8, -1
; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_xor_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_xor_b32_sdwa v9, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7
-; GFX8-NEXT: v_and_b32_e32 v10, 7, v10
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_and_b32_e32 v9, 7, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 25, v1
; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, v9, v7
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
@@ -1305,21 +1294,21 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX9-NEXT: v_mov_b32_e32 v7, 0xff
; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
; GFX9-NEXT: v_mov_b32_e32 v4, 7
-; GFX9-NEXT: v_mov_b32_e32 v10, -1
+; GFX9-NEXT: v_mov_b32_e32 v9, -1
; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_xor_b32_sdwa v11, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v8, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_xor_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT: v_xor_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT: v_lshrrev_b16_e32 v9, 1, v9
-; GFX9-NEXT: v_and_b32_e32 v11, 7, v11
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_xor_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b16_e32 v8, 1, v8
+; GFX9-NEXT: v_and_b32_e32 v10, 7, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 25, v1
; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshrrev_b16_e32 v9, v11, v9
+; GFX9-NEXT: v_lshrrev_b16_e32 v8, v10, v8
; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
-; GFX9-NEXT: v_or_b32_e32 v5, v5, v9
+; GFX9-NEXT: v_or_b32_e32 v5, v5, v8
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 8
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -1334,111 +1323,109 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-LABEL: v_fshl_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX10-NEXT: v_and_b32_e32 v8, 7, v2
-; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v1
-; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2
-; GFX10-NEXT: v_and_b32_e32 v11, 7, v6
+; GFX10-NEXT: v_and_b32_e32 v7, 7, v2
+; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1
+; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2
+; GFX10-NEXT: v_and_b32_e32 v10, 7, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1
-; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
-; GFX10-NEXT: v_lshrrev_b16 v8, 1, v9
-; GFX10-NEXT: v_and_b32_e32 v9, 7, v10
-; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0
+; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0
+; GFX10-NEXT: v_lshrrev_b16 v7, 1, v8
+; GFX10-NEXT: v_and_b32_e32 v8, 7, v9
+; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3
+; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v11
; GFX10-NEXT: v_mov_b32_e32 v10, 0xff
; GFX10-NEXT: v_mov_b32_e32 v11, -1
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6
-; GFX10-NEXT: v_mov_b32_e32 v13, 7
-; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_xor_b32_sdwa v10, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5
+; GFX10-NEXT: v_mov_b32_e32 v12, 7
+; GFX10-NEXT: v_lshrrev_b16 v9, 1, v9
+; GFX10-NEXT: v_and_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_xor_b32_sdwa v13, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_xor_b32_sdwa v11, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7
-; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
-; GFX10-NEXT: v_and_b32_sdwa v14, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
-; GFX10-NEXT: v_and_b32_e32 v10, 7, v10
-; GFX10-NEXT: v_and_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12
+; GFX10-NEXT: v_and_b32_e32 v5, 7, v5
+; GFX10-NEXT: v_and_b32_sdwa v14, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b16 v10, 1, v10
+; GFX10-NEXT: v_and_b32_e32 v13, 7, v13
+; GFX10-NEXT: v_and_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 25, v1
; GFX10-NEXT: v_and_b32_e32 v11, 7, v11
-; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7
+; GFX10-NEXT: v_lshrrev_b16 v5, v5, v9
; GFX10-NEXT: v_lshlrev_b16 v4, v14, v4
-; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1
-; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5
-; GFX10-NEXT: v_lshrrev_b16 v5, v11, v12
-; GFX10-NEXT: v_lshrrev_b16 v7, v9, v8
-; GFX10-NEXT: v_or_b32_e32 v3, v3, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, 8
-; GFX10-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX10-NEXT: v_or_b32_e32 v2, v2, v5
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshrrev_b16 v9, v13, v10
+; GFX10-NEXT: v_lshlrev_b16 v2, v2, v6
+; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1
+; GFX10-NEXT: v_lshrrev_b16 v6, v8, v7
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, 8
+; GFX10-NEXT: v_or_b32_e32 v4, v4, v9
+; GFX10-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v6
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshl_v4i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 8, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_xor_b32_e32 v13, -1, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2
-; GFX11-NEXT: v_and_b32_e32 v9, 7, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1
+; GFX11-NEXT: v_xor_b32_e32 v13, -1, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 24, v2
+; GFX11-NEXT: v_and_b32_e32 v8, 7, v8
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX11-NEXT: v_lshrrev_b16 v6, 1, v6
; GFX11-NEXT: v_and_b32_e32 v13, 7, v13
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3
-; GFX11-NEXT: v_xor_b32_e32 v9, -1, v10
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6
-; GFX11-NEXT: v_xor_b32_e32 v13, -1, v11
+; GFX11-NEXT: v_lshlrev_b16 v3, v8, v3
+; GFX11-NEXT: v_xor_b32_e32 v8, -1, v9
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX11-NEXT: v_and_b32_e32 v12, 7, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6
+; GFX11-NEXT: v_xor_b32_e32 v13, -1, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 7, v2
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v1
; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX11-NEXT: v_and_b32_e32 v10, 7, v10
-; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7
; GFX11-NEXT: v_and_b32_e32 v9, 7, v9
-; GFX11-NEXT: v_and_b32_e32 v11, 7, v11
-; GFX11-NEXT: v_lshrrev_b16 v8, 1, v8
+; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7
+; GFX11-NEXT: v_and_b32_e32 v8, 7, v8
+; GFX11-NEXT: v_and_b32_e32 v10, 7, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 25, v1
; GFX11-NEXT: v_and_b32_e32 v13, 7, v13
-; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1
+; GFX11-NEXT: v_lshrrev_b16 v12, 1, v12
; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
; GFX11-NEXT: v_or_b32_e32 v3, v3, v6
-; GFX11-NEXT: v_lshlrev_b16 v4, v10, v4
-; GFX11-NEXT: v_lshrrev_b16 v6, v9, v7
-; GFX11-NEXT: v_lshlrev_b16 v5, v11, v5
-; GFX11-NEXT: v_lshrrev_b16 v7, v13, v8
-; GFX11-NEXT: v_lshlrev_b16 v0, v12, v0
-; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-NEXT: v_or_b32_e32 v3, v4, v6
-; GFX11-NEXT: v_or_b32_e32 v4, v5, v7
+; GFX11-NEXT: v_lshlrev_b16 v4, v9, v4
+; GFX11-NEXT: v_lshrrev_b16 v6, v8, v7
+; GFX11-NEXT: v_lshlrev_b16 v5, v10, v5
+; GFX11-NEXT: v_lshrrev_b16 v1, v13, v1
+; GFX11-NEXT: v_lshlrev_b16 v0, v11, v0
+; GFX11-NEXT: v_lshrrev_b16 v2, v2, v12
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX11-NEXT: v_or_b32_e32 v1, v5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3
-; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -3686,22 +3673,21 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
;
; GFX8-LABEL: s_fshl_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s4, s1, 16
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_lshr_b32 s5, s2, 16
-; GFX8-NEXT: s_and_b32 s6, s2, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s2
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: s_and_b32 s5, s2, 15
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
-; GFX8-NEXT: s_lshl_b32 s0, s0, s6
-; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s1, s5, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s5
-; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_lshr_b32 s3, s4, 1
-; GFX8-NEXT: s_lshr_b32 s2, s3, s2
-; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s5
+; GFX8-NEXT: s_and_b32 s5, 0xffff, s1
+; GFX8-NEXT: s_lshr_b32 s4, s2, 16
+; GFX8-NEXT: s_andn2_b32 s2, 15, s2
+; GFX8-NEXT: s_lshr_b32 s5, s5, 1
+; GFX8-NEXT: s_lshr_b32 s2, s5, s2
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s2, s4, 15
+; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_lshr_b32 s1, s1, 17
+; GFX8-NEXT: s_lshl_b32 s2, s3, s2
+; GFX8-NEXT: s_lshr_b32 s1, s1, s4
+; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
@@ -3813,13 +3799,12 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v5
; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
; GFX8-NEXT: v_mov_b32_e32 v4, 15
-; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v5, -1
+; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v4, 1
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 17, v1
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -3886,14 +3871,12 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v3, 8
; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -3964,11 +3947,10 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 15
; GFX8-NEXT: v_mov_b32_e32 v3, -1
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX8-NEXT: s_lshr_b32 s0, s3, 1
+; GFX8-NEXT: s_lshr_b32 s0, s1, 17
; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2
; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
@@ -4058,11 +4040,10 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, s4
; GFX8-NEXT: v_lshrrev_b16_e32 v1, s1, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, 1
; GFX8-NEXT: v_or_b32_e32 v1, s0, v1
; GFX8-NEXT: s_and_b32 s0, s3, 15
; GFX8-NEXT: s_andn2_b32 s1, 15, s3
-; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 17, v0
; GFX8-NEXT: s_lshl_b32 s0, s2, s0
; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
@@ -4142,21 +4123,20 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
;
; GFX8-LABEL: v_fshl_v2i16_vss:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_and_b32 s4, s1, 15
+; GFX8-NEXT: s_and_b32 s3, s1, 15
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, s3, v0
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s0
+; GFX8-NEXT: s_lshr_b32 s2, s1, 16
; GFX8-NEXT: s_andn2_b32 s1, 15, s1
-; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v0
-; GFX8-NEXT: s_lshr_b32 s0, s0, s1
-; GFX8-NEXT: v_or_b32_e32 v1, s0, v1
-; GFX8-NEXT: s_and_b32 s0, s3, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: s_lshr_b32 s0, s2, 1
+; GFX8-NEXT: s_lshr_b32 s3, s3, 1
+; GFX8-NEXT: s_lshr_b32 s1, s3, s1
+; GFX8-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX8-NEXT: s_and_b32 s1, s2, 15
+; GFX8-NEXT: s_andn2_b32 s2, 15, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_lshr_b32 s0, s0, 17
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: s_lshr_b32 s0, s0, s1
+; GFX8-NEXT: s_lshr_b32 s0, s0, s2
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -4256,23 +4236,22 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
;
; GFX8-LABEL: s_fshl_v3i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s7, s2, 16
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_lshr_b32 s8, s4, 16
-; GFX8-NEXT: s_and_b32 s9, s4, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s4
-; GFX8-NEXT: s_lshr_b32 s2, s2, 1
+; GFX8-NEXT: s_and_b32 s8, s4, 15
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
-; GFX8-NEXT: s_lshl_b32 s0, s0, s9
-; GFX8-NEXT: s_lshr_b32 s2, s2, s4
-; GFX8-NEXT: s_or_b32 s0, s0, s2
-; GFX8-NEXT: s_and_b32 s2, s8, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s8
-; GFX8-NEXT: s_lshl_b32 s2, s6, s2
-; GFX8-NEXT: s_lshr_b32 s6, s7, 1
-; GFX8-NEXT: s_lshr_b32 s4, s6, s4
+; GFX8-NEXT: s_lshl_b32 s0, s0, s8
+; GFX8-NEXT: s_and_b32 s8, 0xffff, s2
+; GFX8-NEXT: s_lshr_b32 s7, s4, 16
+; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_lshr_b32 s8, s8, 1
+; GFX8-NEXT: s_lshr_b32 s4, s8, s4
+; GFX8-NEXT: s_or_b32 s0, s0, s4
+; GFX8-NEXT: s_and_b32 s4, s7, 15
+; GFX8-NEXT: s_andn2_b32 s7, 15, s7
+; GFX8-NEXT: s_lshr_b32 s2, s2, 17
+; GFX8-NEXT: s_lshl_b32 s4, s6, s4
+; GFX8-NEXT: s_lshr_b32 s2, s2, s7
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_or_b32 s2, s2, s4
+; GFX8-NEXT: s_or_b32 s2, s4, s2
; GFX8-NEXT: s_and_b32 s4, s5, 15
; GFX8-NEXT: s_andn2_b32 s5, 15, s5
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
@@ -4469,13 +4448,12 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8
; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
; GFX8-NEXT: v_mov_b32_e32 v7, 15
-; GFX8-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v8, -1
+; GFX8-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v7, 1
; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 17, v2
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v5
@@ -4593,39 +4571,37 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
;
; GFX8-LABEL: s_fshl_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s8, s2, 16
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_lshr_b32 s10, s4, 16
-; GFX8-NEXT: s_and_b32 s12, s4, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s4
-; GFX8-NEXT: s_lshr_b32 s2, s2, 1
+; GFX8-NEXT: s_and_b32 s10, s4, 15
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
-; GFX8-NEXT: s_lshl_b32 s0, s0, s12
-; GFX8-NEXT: s_lshr_b32 s2, s2, s4
-; GFX8-NEXT: s_or_b32 s0, s0, s2
-; GFX8-NEXT: s_and_b32 s2, s10, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s10
-; GFX8-NEXT: s_lshl_b32 s2, s6, s2
-; GFX8-NEXT: s_lshr_b32 s6, s8, 1
-; GFX8-NEXT: s_lshr_b32 s9, s3, 16
-; GFX8-NEXT: s_lshr_b32 s4, s6, s4
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_lshr_b32 s11, s5, 16
-; GFX8-NEXT: s_or_b32 s2, s2, s4
+; GFX8-NEXT: s_lshl_b32 s0, s0, s10
+; GFX8-NEXT: s_and_b32 s10, 0xffff, s2
+; GFX8-NEXT: s_lshr_b32 s8, s4, 16
+; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_lshr_b32 s10, s10, 1
+; GFX8-NEXT: s_lshr_b32 s4, s10, s4
+; GFX8-NEXT: s_or_b32 s0, s0, s4
+; GFX8-NEXT: s_and_b32 s4, s8, 15
+; GFX8-NEXT: s_andn2_b32 s8, 15, s8
+; GFX8-NEXT: s_lshr_b32 s2, s2, 17
+; GFX8-NEXT: s_lshl_b32 s4, s6, s4
+; GFX8-NEXT: s_lshr_b32 s2, s2, s8
+; GFX8-NEXT: s_or_b32 s2, s4, s2
; GFX8-NEXT: s_and_b32 s4, s5, 15
-; GFX8-NEXT: s_andn2_b32 s5, 15, s5
-; GFX8-NEXT: s_lshr_b32 s3, s3, 1
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
; GFX8-NEXT: s_lshl_b32 s1, s1, s4
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s3
+; GFX8-NEXT: s_lshr_b32 s9, s5, 16
+; GFX8-NEXT: s_andn2_b32 s5, 15, s5
+; GFX8-NEXT: s_lshr_b32 s4, s4, 1
+; GFX8-NEXT: s_lshr_b32 s4, s4, s5
+; GFX8-NEXT: s_or_b32 s1, s1, s4
+; GFX8-NEXT: s_and_b32 s4, s9, 15
+; GFX8-NEXT: s_andn2_b32 s5, 15, s9
+; GFX8-NEXT: s_lshr_b32 s3, s3, 17
+; GFX8-NEXT: s_lshl_b32 s4, s7, s4
; GFX8-NEXT: s_lshr_b32 s3, s3, s5
-; GFX8-NEXT: s_or_b32 s1, s1, s3
-; GFX8-NEXT: s_and_b32 s3, s11, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s11
-; GFX8-NEXT: s_lshr_b32 s5, s9, 1
-; GFX8-NEXT: s_lshl_b32 s3, s7, s3
-; GFX8-NEXT: s_lshr_b32 s4, s5, s4
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_or_b32 s3, s3, s4
+; GFX8-NEXT: s_or_b32 s3, s4, s3
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
; GFX8-NEXT: s_or_b32 s0, s0, s2
@@ -4810,26 +4786,25 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8
; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
; GFX8-NEXT: v_mov_b32_e32 v7, 15
-; GFX8-NEXT: v_and_b32_sdwa v8, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v9, -1
+; GFX8-NEXT: v_and_b32_sdwa v8, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v8, 1
; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 17, v2
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2
; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v5
; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT: v_lshrrev_b16_e32 v10, 1, v3
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v3
; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v10
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8
; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
; GFX8-NEXT: v_and_b32_sdwa v4, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 17, v3
; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
@@ -5023,10 +4998,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX6-LABEL: v_fshl_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 63, v4
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT: v_not_b32_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 63
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
@@ -5036,10 +5010,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX8-LABEL: v_fshl_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v5, 63, v4
; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT: v_not_b32_e32 v4, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX8-NEXT: v_bfi_b32 v4, v4, 0, 63
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
@@ -5049,10 +5022,9 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX9-LABEL: v_fshl_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v5, 63, v4
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: v_not_b32_e32 v4, v4
-; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX9-NEXT: v_bfi_b32 v4, v4, 0, 63
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
@@ -5062,12 +5034,11 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX10-LABEL: v_fshl_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v5, v4
; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 63, v5
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX10-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX10-NEXT: v_bfi_b32 v4, v4, 0, 63
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -5075,16 +5046,14 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX11-LABEL: v_fshl_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v5, v4
; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v5, 63, v5
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX11-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX11-NEXT: v_bfi_b32 v4, v4, 0, 63
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
@@ -5204,10 +5173,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
; GFX6-LABEL: v_fshl_i64_ssv:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_and_b32_e32 v1, 63, v0
-; GFX6-NEXT: v_not_b32_e32 v0, v0
; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1
; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
-; GFX6-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX6-NEXT: v_bfi_b32 v0, v0, 0, 63
; GFX6-NEXT: v_lshr_b64 v[3:4], s[0:1], v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v3
; GFX6-NEXT: v_or_b32_e32 v1, v2, v4
@@ -5216,10 +5184,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
; GFX8-LABEL: v_fshl_i64_ssv:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_and_b32_e32 v1, 63, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
; GFX8-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
-; GFX8-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, 63
; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1]
; GFX8-NEXT: v_or_b32_e32 v0, v1, v3
; GFX8-NEXT: v_or_b32_e32 v1, v2, v4
@@ -5228,10 +5195,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
; GFX9-LABEL: v_fshl_i64_ssv:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_and_b32_e32 v1, 63, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
-; GFX9-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX9-NEXT: v_bfi_b32 v0, v0, 0, 63
; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1]
; GFX9-NEXT: v_or_b32_e32 v0, v1, v3
; GFX9-NEXT: v_or_b32_e32 v1, v2, v4
@@ -5239,11 +5205,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
;
; GFX10-LABEL: v_fshl_i64_ssv:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_not_b32_e32 v1, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 63, v0
+; GFX10-NEXT: v_bfi_b32 v2, v0, 0, 63
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: v_and_b32_e32 v2, 63, v1
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
@@ -5251,16 +5216,14 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
;
; GFX11-LABEL: v_fshl_i64_ssv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_not_b32_e32 v1, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 63, v0
+; GFX11-NEXT: v_bfi_b32 v2, v0, 0, 63
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v2, 63, v1
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1]
; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
@@ -5466,18 +5429,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX6-LABEL: v_fshl_v2i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v9, 63, v8
; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1
-; GFX6-NEXT: v_not_b32_e32 v8, v8
-; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX6-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX6-NEXT: v_bfi_b32 v8, v8, 0, 63
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9
; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8
; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-NEXT: v_and_b32_e32 v4, 63, v10
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
-; GFX6-NEXT: v_not_b32_e32 v4, v10
-; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5487,18 +5448,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX8-LABEL: v_fshl_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v9, 63, v8
; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
-; GFX8-NEXT: v_not_b32_e32 v8, v8
-; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX8-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX8-NEXT: v_bfi_b32 v8, v8, 0, 63
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_and_b32_e32 v4, 63, v10
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX8-NEXT: v_not_b32_e32 v4, v10
-; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5508,18 +5467,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX9-LABEL: v_fshl_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
-; GFX9-NEXT: v_not_b32_e32 v8, v8
-; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX9-NEXT: v_bfi_b32 v8, v8, 0, 63
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
; GFX9-NEXT: v_and_b32_e32 v4, 63, v10
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX9-NEXT: v_not_b32_e32 v4, v10
-; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
; GFX9-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5529,18 +5486,16 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX10-LABEL: v_fshl_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v9, v8
-; GFX10-NEXT: v_not_b32_e32 v11, v10
; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX10-NEXT: v_and_b32_e32 v9, 63, v9
-; GFX10-NEXT: v_and_b32_e32 v10, 63, v10
-; GFX10-NEXT: v_and_b32_e32 v11, 63, v11
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX10-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
-; GFX10-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX10-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7]
+; GFX10-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX10-NEXT: v_bfi_b32 v8, v8, 0, 63
+; GFX10-NEXT: v_and_b32_e32 v11, 63, v10
+; GFX10-NEXT: v_bfi_b32 v10, v10, 0, 63
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
+; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5550,20 +5505,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX11-LABEL: v_fshl_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v9, v8
-; GFX11-NEXT: v_not_b32_e32 v11, v10
; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX11-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX11-NEXT: v_and_b32_e32 v9, 63, v9
-; GFX11-NEXT: v_and_b32_e32 v10, 63, v10
-; GFX11-NEXT: v_and_b32_e32 v11, 63, v11
+; GFX11-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX11-NEXT: v_bfi_b32 v8, v8, 0, 63
+; GFX11-NEXT: v_and_b32_e32 v11, 63, v10
+; GFX11-NEXT: v_bfi_b32 v10, v10, 0, 63
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX11-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX11-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7]
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
+; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v4
; GFX11-NEXT: v_or_b32_e32 v1, v1, v5
@@ -5818,32 +5771,32 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX6-LABEL: v_fshl_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v15
-; GFX6-NEXT: v_add_i32_e32 v17, vcc, 0xffffffc0, v15
+; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v8
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v16
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, 0xffffffc0, v16
; GFX6-NEXT: v_lshr_b64 v[9:10], v[0:1], v9
-; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15
-; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v15
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v17
+; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v16
+; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v16
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v18
; GFX6-NEXT: v_or_b32_e32 v9, v9, v11
; GFX6-NEXT: v_or_b32_e32 v10, v10, v12
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc
; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1
+; GFX6-NEXT: v_mov_b32_e32 v15, 0x7f
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v6
-; GFX6-NEXT: v_not_b32_e32 v4, v8
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1
-; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v4
-; GFX6-NEXT: v_not_b32_e32 v16, 63
+; GFX6-NEXT: v_bfi_b32 v14, v8, 0, v15
+; GFX6-NEXT: v_not_b32_e32 v17, 63
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v14
-; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v16
+; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v17
; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v14
; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6
; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v14
@@ -5867,32 +5820,32 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX8-LABEL: v_fshl_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v15
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffffc0, v15
+; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v8
+; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v16
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xffffffc0, v16
; GFX8-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v17, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[11:12], v16, v[2:3]
+; GFX8-NEXT: v_lshlrev_b64 v[13:14], v16, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v18, v[0:1]
; GFX8-NEXT: v_or_b32_e32 v9, v9, v11
; GFX8-NEXT: v_or_b32_e32 v10, v10, v12
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v15, 0x7f
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v6
-; GFX8-NEXT: v_not_b32_e32 v4, v8
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7]
-; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v4
-; GFX8-NEXT: v_not_b32_e32 v16, 63
+; GFX8-NEXT: v_bfi_b32 v14, v8, 0, v15
+; GFX8-NEXT: v_not_b32_e32 v17, 63
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v14
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v16
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v17
; GFX8-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3]
; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3]
@@ -5916,27 +5869,27 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX9-LABEL: v_fshl_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX9-NEXT: v_sub_u32_e32 v9, 64, v15
-; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v15
+; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v8
+; GFX9-NEXT: v_sub_u32_e32 v9, 64, v16
+; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16
; GFX9-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[11:12], v16, v[2:3]
+; GFX9-NEXT: v_lshlrev_b64 v[13:14], v16, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v17, v[0:1]
; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
; GFX9-NEXT: v_or_b32_e32 v10, v10, v12
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; GFX9-NEXT: v_mov_b32_e32 v15, 0x7f
; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5]
-; GFX9-NEXT: v_not_b32_e32 v4, v8
; GFX9-NEXT: v_cndmask_b32_e32 v13, v9, v3, vcc
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7]
-; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v4
+; GFX9-NEXT: v_bfi_b32 v14, v8, 0, v15
; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1
; GFX9-NEXT: v_sub_u32_e32 v6, 64, v14
; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14
@@ -5963,99 +5916,96 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX10-LABEL: v_fshl_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8
-; GFX10-NEXT: v_not_b32_e32 v12, v8
+; GFX10-NEXT: v_and_b32_e32 v17, 0x7f, v8
; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
-; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18
-; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v12
-; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18
-; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3]
+; GFX10-NEXT: v_bfi_b32 v18, v8, 0, 0x7f
+; GFX10-NEXT: v_lshrrev_b64 v[9:10], 1, v[6:7]
+; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v17
+; GFX10-NEXT: v_lshlrev_b64 v[7:8], v17, v[2:3]
+; GFX10-NEXT: v_add_nc_u32_e32 v15, 0xffffffc0, v17
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5
-; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1]
-; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19
-; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18
-; GFX10-NEXT: v_or_b32_e32 v8, v10, v8
-; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19
-; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5]
-; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7]
-; GFX10-NEXT: v_or_b32_e32 v11, v11, v9
-; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo
-; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7]
-; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v19
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v18
-; GFX10-NEXT: v_or_b32_e32 v0, v14, v16
-; GFX10-NEXT: v_or_b32_e32 v10, v15, v17
-; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19
-; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s5
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7]
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
-; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo
+; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v18
+; GFX10-NEXT: v_lshrrev_b64 v[11:12], v11, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[13:14], v17, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1]
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v17
+; GFX10-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v18
+; GFX10-NEXT: v_lshlrev_b64 v[15:16], v16, v[9:10]
+; GFX10-NEXT: v_or_b32_e32 v11, v11, v7
+; GFX10-NEXT: v_lshrrev_b64 v[6:7], v18, v[4:5]
+; GFX10-NEXT: v_or_b32_e32 v8, v12, v8
+; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v18
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v11, vcc_lo
+; GFX10-NEXT: v_lshrrev_b64 v[11:12], v19, v[9:10]
+; GFX10-NEXT: v_or_b32_e32 v0, v6, v15
+; GFX10-NEXT: v_or_b32_e32 v6, v7, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v0, s5
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v18, v[9:10]
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v6, s5
+; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v20, v2, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s6
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s6
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s5
-; GFX10-NEXT: v_or_b32_e32 v0, v12, v4
-; GFX10-NEXT: v_or_b32_e32 v1, v7, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v1, s5
+; GFX10-NEXT: v_or_b32_e32 v0, v13, v4
+; GFX10-NEXT: v_or_b32_e32 v1, v9, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX10-NEXT: v_or_b32_e32 v3, v3, v8
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshl_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8
-; GFX11-NEXT: v_not_b32_e32 v12, v8
+; GFX11-NEXT: v_and_b32_e32 v17, 0x7f, v8
; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18
-; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v12
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3]
-; GFX11-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1]
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18
-; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1]
-; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5
-; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
-; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v19
-; GFX11-NEXT: v_or_b32_e32 v8, v10, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19
-; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5]
-; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7]
-; GFX11-NEXT: v_or_b32_e32 v11, v11, v9
-; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7]
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v19
-; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v0, v14, v16
-; GFX11-NEXT: v_or_b32_e32 v10, v15, v17
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v18
+; GFX11-NEXT: v_bfi_b32 v18, v8, 0, 0x7f
+; GFX11-NEXT: v_lshrrev_b64 v[9:10], 1, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v0, s1
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7]
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v10, s1
-; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo
+; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v17
+; GFX11-NEXT: v_lshlrev_b64 v[7:8], v17, v[2:3]
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0xffffffc0, v17
+; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5
+; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v18
+; GFX11-NEXT: v_lshrrev_b64 v[11:12], v11, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[13:14], v17, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v18
+; GFX11-NEXT: v_lshlrev_b64 v[15:16], v16, v[9:10]
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v7
+; GFX11-NEXT: v_lshrrev_b64 v[6:7], v18, v[4:5]
+; GFX11-NEXT: v_or_b32_e32 v8, v12, v8
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v18
+; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v11, vcc_lo
+; GFX11-NEXT: v_lshrrev_b64 v[11:12], v19, v[9:10]
+; GFX11-NEXT: v_or_b32_e32 v0, v6, v15
+; GFX11-NEXT: v_or_b32_e32 v6, v7, v16
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v8, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v0, s1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v18, v[9:10]
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s1
+; GFX11-NEXT: v_cndmask_b32_e32 v9, 0, v14, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v20, v2, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s2
; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s2
; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v0, v12, v4
-; GFX11-NEXT: v_or_b32_e32 v1, v7, v5
+; GFX11-NEXT: v_or_b32_e32 v0, v13, v4
+; GFX11-NEXT: v_or_b32_e32 v1, v9, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX11-NEXT: v_or_b32_e32 v3, v3, v8
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
ret i128 %result
@@ -6064,264 +6014,260 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
; GFX6-LABEL: v_fshl_i128_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7
+; GFX6-NEXT: v_and_b32_e32 v8, 0x7f, v0
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v8
; GFX6-NEXT: v_lshr_b64 v[1:2], s[0:1], v1
-; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7
-; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0xffffffc0, v7
-; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v7
+; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v8
+; GFX6-NEXT: v_add_i32_e32 v10, vcc, 0xffffffc0, v8
+; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v8
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
; GFX6-NEXT: v_or_b32_e32 v4, v2, v4
-; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v9
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX6-NEXT: v_not_b32_e32 v0, v0
+; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v10
+; GFX6-NEXT: v_mov_b32_e32 v7, 0x7f
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
; GFX6-NEXT: s_mov_b32 s8, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v5, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT: v_mov_b32_e32 v3, s2
; GFX6-NEXT: v_mov_b32_e32 v4, s3
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX6-NEXT: s_lshl_b32 s9, s6, 31
-; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc
+; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v7
+; GFX6-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v11, v2, v4, vcc
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v11
-; GFX6-NEXT: v_not_b32_e32 v8, 63
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v11
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7
+; GFX6-NEXT: v_not_b32_e32 v9, 63
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7
; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2
-; GFX6-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; GFX6-NEXT: v_add_i32_e32 v9, vcc, v7, v9
; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v8
-; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v11
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v9
+; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v9, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v10, v0
; GFX6-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v8, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v11, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshl_i128_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0
-; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7
+; GFX8-NEXT: v_and_b32_e32 v8, 0x7f, v0
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v8
; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffffc0, v7
-; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[3:4], v8, s[2:3]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffffc0, v8
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v8, s[0:1]
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
; GFX8-NEXT: v_or_b32_e32 v4, v2, v4
-; GFX8-NEXT: v_lshlrev_b64 v[1:2], v9, s[0:1]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX8-NEXT: v_not_b32_e32 v0, v0
+; GFX8-NEXT: v_lshlrev_b64 v[1:2], v10, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x7f
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
; GFX8-NEXT: s_mov_b32 s8, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX8-NEXT: s_lshl_b32 s9, s6, 31
-; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc
+; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v2, v4, vcc
; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v11
-; GFX8-NEXT: v_not_b32_e32 v8, 63
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1]
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7
+; GFX8-NEXT: v_not_b32_e32 v9, 63
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v11, v8
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v7, v9
; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v8, s[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v11, s[2:3]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v9, s[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v10, v0
; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v8, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v11, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshl_i128_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0
-; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7
+; GFX9-NEXT: v_and_b32_e32 v8, 0x7f, v0
+; GFX9-NEXT: v_sub_u32_e32 v1, 64, v8
; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
-; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7
-; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[3:4], v8, s[2:3]
+; GFX9-NEXT: v_add_u32_e32 v9, 0xffffffc0, v8
+; GFX9-NEXT: v_lshlrev_b64 v[5:6], v8, s[0:1]
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
; GFX9-NEXT: v_or_b32_e32 v4, v2, v4
-; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX9-NEXT: v_not_b32_e32 v0, v0
+; GFX9-NEXT: v_lshlrev_b64 v[1:2], v9, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7f
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX9-NEXT: s_lshl_b32 s9, s6, 31
-; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0
+; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v7
; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc
; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
-; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10
+; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v7
; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, v8, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v9, v0
; GFX9-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX9-NEXT: v_or_b32_e32 v3, v9, v3
+; GFX9-NEXT: v_or_b32_e32 v2, v8, v2
+; GFX9-NEXT: v_or_b32_e32 v3, v10, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshl_i128_ssv:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX10-NEXT: v_not_b32_e32 v6, v0
+; GFX10-NEXT: v_and_b32_e32 v11, 0x7f, v0
+; GFX10-NEXT: v_bfi_b32 v12, v0, 0, 0x7f
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
; GFX10-NEXT: s_lshl_b32 s9, s6, 31
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3]
-; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v6
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v12
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v11
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v11
+; GFX10-NEXT: v_lshlrev_b64 v[1:2], v11, s[2:3]
+; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12
; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[3:4], v3, s[0:1]
; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], v7, s[0:1]
-; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13
-; GFX10-NEXT: v_lshrrev_b64 v[8:9], v13, s[8:9]
-; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v13
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1]
-; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v13
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v12
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b64 v[10:11], v2, s[6:7]
-; GFX10-NEXT: v_or_b32_e32 v2, v3, v1
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v13
-; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v3, v8, v10
-; GFX10-NEXT: v_or_b32_e32 v8, v9, v11
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1
+; GFX10-NEXT: v_lshlrev_b64 v[7:8], v7, s[0:1]
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
+; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v1
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[8:9]
+; GFX10-NEXT: v_or_b32_e32 v4, v4, v2
+; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
+; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1
-; GFX10-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX10-NEXT: v_or_b32_e32 v1, v5, v1
-; GFX10-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX10-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX10-NEXT: v_or_b32_e32 v9, v1, v10
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s1
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s2, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s3, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1
+; GFX10-NEXT: v_or_b32_e32 v0, v5, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v6, v3
+; GFX10-NEXT: v_or_b32_e32 v2, v7, v8
+; GFX10-NEXT: v_or_b32_e32 v3, v4, v9
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshl_i128_ssv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX11-NEXT: v_not_b32_e32 v6, v0
-; GFX11-NEXT: s_lshl_b32 s9, s6, 31
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
+; GFX11-NEXT: v_and_b32_e32 v11, 0x7f, v0
+; GFX11-NEXT: v_bfi_b32 v12, v0, 0, 0x7f
; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3]
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v12
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1]
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_lshl_b32 s9, s6, 31
+; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
+; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12
; GFX11-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], v7, s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v13
-; GFX11-NEXT: v_lshlrev_b64 v[10:11], v2, s[6:7]
-; GFX11-NEXT: v_or_b32_e32 v2, v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1]
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13
-; GFX11-NEXT: v_lshrrev_b64 v[8:9], v13, s[8:9]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v13
-; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v13
+; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v6 :: v_dual_add_nc_u32 v13, 0xffffffc0, v12
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v11
+; GFX11-NEXT: v_lshlrev_b64 v[1:2], v11, s[2:3]
+; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
+; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12
; GFX11-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s2, s0
-; GFX11-NEXT: v_or_b32_e32 v3, v8, v10
-; GFX11-NEXT: v_or_b32_e32 v8, v9, v11
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s3, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1
+; GFX11-NEXT: v_lshrrev_b64 v[3:4], v3, s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffc0, v11
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b64 v[7:8], v7, s[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v9
+; GFX11-NEXT: v_or_b32_e32 v9, v1, v10
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo
; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s9, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1
-; GFX11-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s2, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s3, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s8, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s9, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v1, v5, v1
-; GFX11-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX11-NEXT: v_or_b32_e32 v0, v5, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v2, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v9
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
%cast.result = bitcast i128 %result to <4 x float>
@@ -7445,185 +7391,183 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6-LABEL: v_fshl_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23
-; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17
-; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], 1
-; GFX6-NEXT: v_not_b32_e32 v16, v16
-; GFX6-NEXT: v_or_b32_e32 v21, v17, v21
-; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10
-; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1
-; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX6-NEXT: v_or_b32_e32 v9, v9, v17
-; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24
-; GFX6-NEXT: v_or_b32_e32 v22, v18, v22
+; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16
+; GFX6-NEXT: v_not_b32_e32 v18, 63
+; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 64, v19
+; GFX6-NEXT: v_add_i32_e32 v27, vcc, v19, v18
+; GFX6-NEXT: v_lshr_b64 v[23:24], v[0:1], v23
+; GFX6-NEXT: v_lshl_b64 v[25:26], v[2:3], v19
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v19
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v27
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX6-NEXT: v_or_b32_e32 v19, v23, v25
+; GFX6-NEXT: v_or_b32_e32 v23, v24, v26
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5]
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1
+; GFX6-NEXT: v_mov_b32_e32 v17, 0x7f
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1
+; GFX6-NEXT: v_bfi_b32 v10, v16, 0, v17
+; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc
+; GFX6-NEXT: v_add_i32_e32 v16, vcc, v10, v18
+; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v10
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v10
+; GFX6-NEXT: v_lshr_b64 v[10:11], v[0:1], v10
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v21
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v16
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v21
+; GFX6-NEXT: v_or_b32_e32 v11, v11, v22
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v16
+; GFX6-NEXT: v_lshr_b64 v[10:11], v[4:5], v10
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[6:7], v16
+; GFX6-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX6-NEXT: v_add_i32_e32 v19, vcc, v16, v18
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v16
+; GFX6-NEXT: v_or_b32_e32 v16, v10, v21
+; GFX6-NEXT: v_or_b32_e32 v21, v11, v22
+; GFX6-NEXT: v_lshl_b64 v[10:11], v[4:5], v19
+; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5]
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[12:13], 1
+; GFX6-NEXT: v_lshlrev_b32_e32 v10, 31, v14
+; GFX6-NEXT: v_or_b32_e32 v9, v9, v10
+; GFX6-NEXT: v_lshr_b64 v[10:11], v[14:15], 1
+; GFX6-NEXT: v_bfi_b32 v14, v20, 0, v17
+; GFX6-NEXT: v_add_i32_e32 v18, vcc, v14, v18
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v14
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX6-NEXT: v_lshr_b64 v[12:13], v[10:11], v14
+; GFX6-NEXT: v_lshr_b64 v[14:15], v[8:9], v14
; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16
-; GFX6-NEXT: v_lshr_b64 v[18:19], v[8:9], v24
-; GFX6-NEXT: v_not_b32_e32 v25, 63
-; GFX6-NEXT: v_or_b32_e32 v18, v18, v16
-; GFX6-NEXT: v_add_i32_e32 v16, vcc, v23, v25
-; GFX6-NEXT: v_or_b32_e32 v19, v19, v17
-; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], v16
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX6-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX6-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5]
-; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v24, v25
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v0
-; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v26, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v17, v8
-; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v20
-; GFX6-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5]
-; GFX6-NEXT: v_or_b32_e32 v1, v18, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 64, v17
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v3
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v17
-; GFX6-NEXT: v_or_b32_e32 v3, v16, v19
-; GFX6-NEXT: v_add_i32_e32 v16, vcc, v17, v25
-; GFX6-NEXT: v_or_b32_e32 v10, v8, v10
-; GFX6-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v17
-; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v16
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; GFX6-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14
-; GFX6-NEXT: v_not_b32_e32 v8, v20
-; GFX6-NEXT: v_or_b32_e32 v5, v5, v6
-; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1
-; GFX6-NEXT: v_and_b32_e32 v12, 0x7f, v8
-; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v12
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v12
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10
-; GFX6-NEXT: v_add_i32_e32 v13, vcc, v12, v25
-; GFX6-NEXT: v_or_b32_e32 v10, v8, v10
-; GFX6-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v12
-; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v13
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v4, v16, v4
-; GFX6-NEXT: v_or_b32_e32 v5, v18, v5
-; GFX6-NEXT: v_or_b32_e32 v6, v17, v6
-; GFX6-NEXT: v_or_b32_e32 v7, v19, v7
+; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], v18
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX6-NEXT: v_or_b32_e32 v15, v15, v17
+; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX6-NEXT: v_or_b32_e32 v0, v24, v0
+; GFX6-NEXT: v_or_b32_e32 v1, v25, v1
+; GFX6-NEXT: v_or_b32_e32 v3, v23, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v8
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v9
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v10
+; GFX6-NEXT: v_or_b32_e32 v7, v7, v11
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16
-; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23
-; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX8-NEXT: v_not_b32_e32 v16, v16
-; GFX8-NEXT: v_or_b32_e32 v21, v17, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10
-; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX8-NEXT: v_or_b32_e32 v9, v9, v17
-; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24
-; GFX8-NEXT: v_or_b32_e32 v22, v18, v22
+; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16
+; GFX8-NEXT: v_not_b32_e32 v18, 63
+; GFX8-NEXT: v_sub_u32_e32 v23, vcc, 64, v19
+; GFX8-NEXT: v_add_u32_e32 v27, vcc, v19, v18
+; GFX8-NEXT: v_lshrrev_b64 v[23:24], v23, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[25:26], v19, v[2:3]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v27, v[0:1]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX8-NEXT: v_or_b32_e32 v19, v23, v25
+; GFX8-NEXT: v_or_b32_e32 v23, v24, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5]
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v17, 0x7f
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11]
+; GFX8-NEXT: v_bfi_b32 v10, v16, 0, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v10, v18
+; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v10
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v10, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v21, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v16, v[2:3]
+; GFX8-NEXT: v_or_b32_e32 v10, v10, v21
+; GFX8-NEXT: v_or_b32_e32 v11, v11, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v16
+; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v16, v[6:7]
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, v16, v18
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5]
+; GFX8-NEXT: v_or_b32_e32 v16, v10, v21
+; GFX8-NEXT: v_or_b32_e32 v21, v11, v22
+; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5]
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13]
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 31, v14
+; GFX8-NEXT: v_or_b32_e32 v9, v9, v10
+; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[14:15]
+; GFX8-NEXT: v_bfi_b32 v14, v20, 0, v17
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v14, v18
+; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v14
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[10:11]
+; GFX8-NEXT: v_lshrrev_b64 v[14:15], v14, v[8:9]
; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11]
-; GFX8-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9]
-; GFX8-NEXT: v_not_b32_e32 v25, 63
-; GFX8-NEXT: v_or_b32_e32 v18, v18, v16
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, v23, v25
-; GFX8-NEXT: v_or_b32_e32 v19, v19, v17
-; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX8-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX8-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5]
-; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v24, v25
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11]
-; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, v26, v2
-; GFX8-NEXT: v_or_b32_e32 v2, v17, v8
-; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v20
-; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5]
-; GFX8-NEXT: v_or_b32_e32 v1, v18, v3
-; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 64, v17
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7]
-; GFX8-NEXT: v_or_b32_e32 v3, v16, v19
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, v17, v25
-; GFX8-NEXT: v_or_b32_e32 v10, v8, v10
-; GFX8-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5]
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13]
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14
-; GFX8-NEXT: v_not_b32_e32 v8, v20
-; GFX8-NEXT: v_or_b32_e32 v5, v5, v6
-; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15]
-; GFX8-NEXT: v_and_b32_e32 v12, 0x7f, v8
-; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v12
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[4:5]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7]
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, v12, v25
-; GFX8-NEXT: v_or_b32_e32 v10, v8, v10
-; GFX8-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v12, v[6:7]
-; GFX8-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v4, v16, v4
-; GFX8-NEXT: v_or_b32_e32 v5, v18, v5
-; GFX8-NEXT: v_or_b32_e32 v6, v17, v6
-; GFX8-NEXT: v_or_b32_e32 v7, v19, v7
+; GFX8-NEXT: v_lshrrev_b64 v[10:11], v18, v[10:11]
+; GFX8-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX8-NEXT: v_or_b32_e32 v15, v15, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX8-NEXT: v_or_b32_e32 v0, v24, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v25, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v23, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v8
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v9
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v10
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v11
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshl_v2i128:
@@ -7632,17 +7576,17 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16
; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23
-; GFX9-NEXT: v_not_b32_e32 v16, v16
+; GFX9-NEXT: v_mov_b32_e32 v24, 0x7f
; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3]
; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9
; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24
+; GFX9-NEXT: v_bfi_b32 v25, v16, 0, v24
+; GFX9-NEXT: v_sub_u32_e32 v16, 64, v25
; GFX9-NEXT: v_or_b32_e32 v21, v17, v21
; GFX9-NEXT: v_or_b32_e32 v22, v18, v22
; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11]
-; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9]
+; GFX9-NEXT: v_lshrrev_b64 v[18:19], v25, v[8:9]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
; GFX9-NEXT: v_or_b32_e32 v18, v18, v16
; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v23
@@ -7650,48 +7594,47 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v25, 0, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5]
-; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v24
+; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25
; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11]
-; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11]
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11]
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5]
-; GFX9-NEXT: v_or_b32_e32 v1, v18, v3
-; GFX9-NEXT: v_or_b32_e32 v3, v16, v9
-; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20
-; GFX9-NEXT: v_or_b32_e32 v0, v25, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX9-NEXT: v_or_b32_e32 v0, v26, v2
; GFX9-NEXT: v_or_b32_e32 v2, v17, v8
-; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7]
-; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16
+; GFX9-NEXT: v_and_b32_e32 v17, 0x7f, v20
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5]
+; GFX9-NEXT: v_or_b32_e32 v1, v18, v3
+; GFX9-NEXT: v_sub_u32_e32 v3, 64, v17
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7]
+; GFX9-NEXT: v_or_b32_e32 v3, v16, v19
+; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v17
; GFX9-NEXT: v_or_b32_e32 v10, v8, v10
; GFX9-NEXT: v_or_b32_e32 v11, v9, v11
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v8, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v11, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc
; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v7, vcc
-; GFX9-NEXT: v_not_b32_e32 v8, v20
; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15]
-; GFX9-NEXT: v_and_b32_e32 v13, 0x7f, v8
+; GFX9-NEXT: v_bfi_b32 v13, v20, 0, v24
; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5
; GFX9-NEXT: v_sub_u32_e32 v10, 64, v13
; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5]
@@ -7709,68 +7652,66 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v4, v17, v4
+; GFX9-NEXT: v_or_b32_e32 v4, v16, v4
; GFX9-NEXT: v_or_b32_e32 v5, v18, v5
-; GFX9-NEXT: v_or_b32_e32 v6, v16, v6
+; GFX9-NEXT: v_or_b32_e32 v6, v17, v6
; GFX9-NEXT: v_or_b32_e32 v7, v12, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshl_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16
-; GFX10-NEXT: v_not_b32_e32 v21, v16
+; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v16
; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v27
-; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v21
-; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v27
-; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[2:3]
+; GFX10-NEXT: v_bfi_b32 v29, v16, 0, 0x7f
+; GFX10-NEXT: v_sub_nc_u32_e32 v21, 64, v19
+; GFX10-NEXT: v_add_nc_u32_e32 v25, 0xffffffc0, v19
+; GFX10-NEXT: v_lshlrev_b64 v[23:24], v19, v[2:3]
; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9
-; GFX10-NEXT: v_lshrrev_b64 v[18:19], v18, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28
-; GFX10-NEXT: v_lshlrev_b64 v[16:17], v27, v[0:1]
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1]
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27
-; GFX10-NEXT: v_or_b32_e32 v18, v18, v21
-; GFX10-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v28
-; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9]
-; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11]
-; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v28
-; GFX10-NEXT: v_cndmask_b32_e32 v29, v0, v18, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v0, v19, v22
-; GFX10-NEXT: v_lshrrev_b64 v[18:19], v21, v[10:11]
-; GFX10-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v21, v23, v25
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v27
-; GFX10-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc_lo
+; GFX10-NEXT: v_lshrrev_b64 v[21:22], v21, v[0:1]
+; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v29
+; GFX10-NEXT: v_lshlrev_b64 v[17:18], v19, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1]
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19
+; GFX10-NEXT: v_lshrrev_b64 v[25:26], v29, v[8:9]
+; GFX10-NEXT: v_or_b32_e32 v21, v21, v23
+; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v29
+; GFX10-NEXT: v_lshlrev_b64 v[27:28], v16, v[10:11]
+; GFX10-NEXT: v_cndmask_b32_e32 v30, 0, v17, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v0, v22, v24
+; GFX10-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11]
+; GFX10-NEXT: v_or_b32_e32 v19, v25, v27
+; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v29
+; GFX10-NEXT: v_or_b32_e32 v22, v26, v28
; GFX10-NEXT: v_cndmask_b32_e32 v23, v1, v0, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v21, s5
-; GFX10-NEXT: v_or_b32_e32 v22, v24, v26
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11]
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v29, v2, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v19, v22, s5
+; GFX10-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v19, s5
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v29, v[10:11]
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v22, s5
; GFX10-NEXT: v_cndmask_b32_e64 v22, v23, v3, s4
; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20
-; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v0, s5
-; GFX10-NEXT: v_or_b32_e32 v0, v16, v2
-; GFX10-NEXT: v_not_b32_e32 v16, v20
+; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v2, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13]
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v23
+; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v23
+; GFX10-NEXT: v_bfi_b32 v20, v20, 0, 0x7f
+; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v0, s5
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, v1, s5
-; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v16
-; GFX10-NEXT: v_or_b32_e32 v1, v17, v3
-; GFX10-NEXT: v_add_nc_u32_e32 v17, 0xffffffc0, v23
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GFX10-NEXT: v_or_b32_e32 v0, v30, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v18, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v23
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v10, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7]
; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9
; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15]
; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20
; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5]
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23
; GFX10-NEXT: v_or_b32_e32 v10, v2, v10
; GFX10-NEXT: v_add_nc_u32_e32 v26, 0xffffffc0, v20
@@ -7807,96 +7748,91 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX11-LABEL: v_fshl_v2i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16
-; GFX11-NEXT: v_not_b32_e32 v21, v16
+; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v16
; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshlrev_b64 v[16:17], v27, v[0:1]
-; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v27
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v27
+; GFX11-NEXT: v_bfi_b32 v29, v16, 0, 0x7f
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b64 v[17:18], v19, v[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19
; GFX11-NEXT: v_lshl_or_b32 v9, v10, 31, v9
; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX11-NEXT: v_lshrrev_b64 v[18:19], v18, v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc_lo
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1]
-; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v21
-; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[2:3]
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v27
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v18, v18, v21
-; GFX11-NEXT: v_cndmask_b32_e32 v29, v0, v18, vcc_lo
-; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v28
-; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9]
-; GFX11-NEXT: v_or_b32_e32 v0, v19, v22
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v28
-; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11]
-; GFX11-NEXT: v_lshrrev_b64 v[18:19], v21, v[10:11]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v21, v23, v25
+; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v29
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v29
+; GFX11-NEXT: v_cndmask_b32_e32 v30, 0, v17, vcc_lo
+; GFX11-NEXT: v_sub_nc_u32_e32 v21, 64, v19
+; GFX11-NEXT: v_dual_cndmask_b32 v18, 0, v18 :: v_dual_add_nc_u32 v25, 0xffffffc0, v19
+; GFX11-NEXT: v_lshlrev_b64 v[23:24], v19, v[2:3]
+; GFX11-NEXT: v_lshlrev_b64 v[27:28], v16, v[10:11]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b64 v[21:22], v21, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[25:26], v29, v[8:9]
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v23
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v29
+; GFX11-NEXT: v_or_b32_e32 v19, v25, v27
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v0, v22, v24
+; GFX11-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11]
+; GFX11-NEXT: v_or_b32_e32 v22, v26, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v21, v21, v2, s0
; GFX11-NEXT: v_cndmask_b32_e32 v23, v1, v0, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v22, v24, v26
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v21, s1
-; GFX11-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: v_cndmask_b32_e64 v21, v29, v2, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v10, v19, v22, s1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29
+; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v19, s1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v29, v[10:11]
+; GFX11-NEXT: v_cndmask_b32_e64 v10, v17, v22, s1
; GFX11-NEXT: v_cndmask_b32_e64 v22, v23, v3, s0
; GFX11-NEXT: v_and_b32_e32 v23, 0x7f, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v24, 0, v0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v23
-; GFX11-NEXT: v_or_b32_e32 v0, v16, v2
-; GFX11-NEXT: v_not_b32_e32 v16, v20
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc_lo
+; GFX11-NEXT: v_bfi_b32 v20, v20, 0, 0x7f
; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc_lo
; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13]
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v23
-; GFX11-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5]
-; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v16
-; GFX11-NEXT: v_or_b32_e32 v1, v17, v3
-; GFX11-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7]
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 0xffffffc0, v23
+; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v23
+; GFX11-NEXT: v_cndmask_b32_e64 v24, 0, v0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s1
+; GFX11-NEXT: v_or_b32_e32 v0, v30, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v23
; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9
; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15]
-; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], v17, v[4:5]
-; GFX11-NEXT: v_or_b32_e32 v10, v2, v10
; GFX11-NEXT: v_add_nc_u32_e32 v26, 0xffffffc0, v20
+; GFX11-NEXT: v_or_b32_e32 v1, v18, v3
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], v10, v[4:5]
+; GFX11-NEXT: v_lshlrev_b64 v[10:11], v23, v[6:7]
+; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20
+; GFX11-NEXT: v_lshlrev_b64 v[12:13], v23, v[4:5]
+; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23
; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[8:9]
+; GFX11-NEXT: v_or_b32_e32 v10, v2, v10
; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15]
; GFX11-NEXT: v_or_b32_e32 v2, v21, v24
; GFX11-NEXT: v_or_b32_e32 v11, v3, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v21, v4, v10, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_dual_cndmask_b32 v12, 0, v12 :: v_dual_cndmask_b32 v21, v4, v10
; GFX11-NEXT: v_lshrrev_b64 v[3:4], v26, v[14:15]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20
; GFX11-NEXT: v_or_b32_e32 v10, v16, v18
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20
; GFX11-NEXT: v_or_b32_e32 v16, v17, v19
; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v23
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v10, s1
; GFX11-NEXT: v_lshrrev_b64 v[10:11], v20, v[14:15]
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v6, v21, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, v8, s2
-; GFX11-NEXT: v_or_b32_e32 v3, v22, v25
; GFX11-NEXT: v_cndmask_b32_e64 v8, v4, v9, s2
; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v10, s1
; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v11, s1
+; GFX11-NEXT: v_or_b32_e32 v3, v22, v25
; GFX11-NEXT: v_or_b32_e32 v4, v12, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v5, v13, v8
; GFX11-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v7, v7, v10
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 238cc06fc7f7..ea6b3a3ad786 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -398,8 +398,7 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v3, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
@@ -785,19 +784,17 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX6-LABEL: v_fshr_v2i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX6-NEXT: v_and_b32_e32 v5, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 7, v4
-; GFX6-NEXT: v_not_b32_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 7, v4
+; GFX6-NEXT: v_bfi_b32 v4, v4, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8
; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3
@@ -1187,40 +1184,36 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX6-LABEL: v_fshr_v4i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v7, 8, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v9, 24, v2
; GFX6-NEXT: v_and_b32_e32 v10, 7, v2
-; GFX6-NEXT: v_not_b32_e32 v2, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_bfi_b32 v2, v2, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v10, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 7, v7
-; GFX6-NEXT: v_not_b32_e32 v7, v7
-; GFX6-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX6-NEXT: v_bfi_b32 v7, v7, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v3, v7, v3
; GFX6-NEXT: v_bfe_u32 v7, v1, 8, 8
; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v7
-; GFX6-NEXT: v_not_b32_e32 v7, v8
; GFX6-NEXT: v_lshrrev_b32_e32 v6, 24, v1
; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
; GFX6-NEXT: v_and_b32_e32 v3, 7, v8
-; GFX6-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX6-NEXT: v_bfi_b32 v7, v8, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4
; GFX6-NEXT: v_bfe_u32 v1, v1, 16, 8
; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1
; GFX6-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX6-NEXT: v_not_b32_e32 v4, v9
; GFX6-NEXT: v_and_b32_e32 v3, 7, v9
-; GFX6-NEXT: v_and_b32_e32 v4, 7, v4
+; GFX6-NEXT: v_bfi_b32 v4, v9, 0, 7
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5
; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5
@@ -3411,32 +3404,19 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
; GFX6-LABEL: s_fshr_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s5, s5, 16
-; GFX6-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX6-NEXT: s_or_b32 s4, s5, s4
-; GFX6-NEXT: s_bfe_u32 s5, s2, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s0, s0, 1
-; GFX6-NEXT: s_lshr_b32 s5, s5, 14
-; GFX6-NEXT: s_or_b32 s0, s0, s5
-; GFX6-NEXT: s_bfe_u32 s5, s3, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s1, s1, 1
-; GFX6-NEXT: s_lshr_b32 s5, s5, 14
-; GFX6-NEXT: s_lshl_b32 s2, s2, 1
-; GFX6-NEXT: s_xor_b32 s4, s4, -1
-; GFX6-NEXT: s_or_b32 s1, s1, s5
-; GFX6-NEXT: s_lshr_b32 s5, s4, 16
; GFX6-NEXT: s_and_b32 s6, s4, 15
; GFX6-NEXT: s_andn2_b32 s4, 15, s4
-; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s0, s0, s6
-; GFX6-NEXT: s_lshr_b32 s2, s2, s4
-; GFX6-NEXT: s_lshl_b32 s3, s3, 1
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
+; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT: s_lshl_b32 s0, s0, s4
+; GFX6-NEXT: s_lshr_b32 s2, s2, s6
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s2, s5, 15
; GFX6-NEXT: s_andn2_b32 s4, 15, s5
-; GFX6-NEXT: s_lshl_b32 s1, s1, s2
-; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001
-; GFX6-NEXT: s_lshr_b32 s2, s2, s4
+; GFX6-NEXT: s_lshl_b32 s1, s1, 1
+; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT: s_lshl_b32 s1, s1, s4
+; GFX6-NEXT: s_lshr_b32 s2, s3, s2
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
@@ -3446,33 +3426,22 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
;
; GFX8-LABEL: s_fshr_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s5, 0xffff, s1
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_lshr_b32 s5, s5, 15
-; GFX8-NEXT: s_lshl_b32 s1, s1, 1
-; GFX8-NEXT: s_or_b32 s0, s0, s5
-; GFX8-NEXT: s_lshl_b32 s3, s3, 1
-; GFX8-NEXT: s_lshr_b32 s5, s4, 15
-; GFX8-NEXT: s_xor_b32 s2, s2, -1
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_or_b32 s3, s3, s5
; GFX8-NEXT: s_lshr_b32 s5, s2, 16
; GFX8-NEXT: s_and_b32 s6, s2, 15
; GFX8-NEXT: s_andn2_b32 s2, 15, s2
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
-; GFX8-NEXT: s_lshl_b32 s0, s0, s6
-; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_lshl_b32 s4, s4, 1
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s0, s0, s2
+; GFX8-NEXT: s_lshr_b32 s1, s1, s6
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s5, 15
-; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
; GFX8-NEXT: s_andn2_b32 s2, 15, s5
-; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_lshr_b32 s2, s3, s2
-; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: s_lshl_b32 s3, s3, 1
+; GFX8-NEXT: s_lshl_b32 s2, s3, s2
+; GFX8-NEXT: s_lshr_b32 s1, s4, s1
+; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
@@ -3554,65 +3523,43 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
; GFX6-LABEL: v_fshr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX6-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX6-NEXT: v_bfe_u32 v5, v2, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX6-NEXT: v_bfe_u32 v5, v3, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5
-; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX6-NEXT: v_and_b32_e32 v6, 15, v4
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v2
+; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 15, v5
-; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1
-; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v1
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, 1
-; GFX8-NEXT: v_mov_b32_e32 v5, 15
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v6
-; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v1
-; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v2
-; GFX8-NEXT: v_xor_b32_e32 v7, -1, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, v4, v3
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v6
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT: v_and_b32_sdwa v4, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 15, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT: v_lshlrev_b16_e32 v5, 1, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v5
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, v3, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 15
; GFX8-NEXT: v_mov_b32_e32 v5, -1
+; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v5, 1
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0
+; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -3664,13 +3611,11 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_fshr_v2i16_4_8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 12, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_bfe_u32 v2, v2, 4, 12
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2
+; GFX6-NEXT: v_bfe_u32 v2, v3, 8, 8
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -3678,14 +3623,12 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, 12, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 4, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_mov_b32_e32 v3, 8
; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 4, v1
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, 12, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -3723,35 +3666,22 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) {
; GFX6-LABEL: v_fshr_v2i16_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT: s_bfe_u32 s4, s2, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s0, s0, 1
-; GFX6-NEXT: s_lshr_b32 s4, s4, 14
-; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX6-NEXT: s_or_b32 s0, s0, s4
-; GFX6-NEXT: s_lshl_b32 s2, s2, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_and_b32_e32 v2, 15, v0
; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX6-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2
-; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001
-; GFX6-NEXT: s_bfe_u32 s4, s3, 0xf0001
-; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_lshl_b32 s1, s1, 1
-; GFX6-NEXT: s_lshr_b32 s4, s4, 14
-; GFX6-NEXT: s_lshl_b32 s3, s3, 1
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
+; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_and_b32 s0, s2, 0xffff
+; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_and_b32_e32 v2, 15, v1
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
-; GFX6-NEXT: s_or_b32 s1, s1, s4
; GFX6-NEXT: v_and_b32_e32 v1, 15, v1
-; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001
-; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2
-; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT: s_lshl_b32 s0, s1, 1
+; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1
+; GFX6-NEXT: s_and_b32 s0, s3, 0xffff
+; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -3760,36 +3690,24 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
;
; GFX8-LABEL: v_fshr_v2i16_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s1
+; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_lshr_b32 s4, s4, 15
-; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_or_b32 s0, s0, s4
-; GFX8-NEXT: s_lshl_b32 s1, s1, 1
; GFX8-NEXT: v_and_b32_e32 v1, 15, v0
-; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0
-; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s1
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0
-; GFX8-NEXT: s_lshr_b32 s4, s3, 15
-; GFX8-NEXT: s_lshl_b32 s3, s3, 1
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: v_mov_b32_e32 v2, 15
; GFX8-NEXT: v_mov_b32_e32 v3, -1
-; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s3
-; GFX8-NEXT: s_or_b32 s2, s2, s4
+; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: s_lshl_b32 s0, s2, 1
+; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -3845,33 +3763,20 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) {
; GFX6-LABEL: v_fshr_v2i16_svs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_bfe_u32 v2, v0, 1, 15
-; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT: s_and_b32 s4, s2, 15
+; GFX6-NEXT: s_andn2_b32 s2, 15, s2
; GFX6-NEXT: s_lshl_b32 s0, s0, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 14, v2
-; GFX6-NEXT: v_bfe_u32 v3, v1, 1, 15
-; GFX6-NEXT: s_or_b32 s2, s3, s2
-; GFX6-NEXT: v_or_b32_e32 v2, s0, v2
-; GFX6-NEXT: s_lshl_b32 s0, s1, 1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3
-; GFX6-NEXT: v_or_b32_e32 v3, s0, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_xor_b32 s0, s2, -1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: s_lshr_b32 s1, s0, 16
-; GFX6-NEXT: s_and_b32 s2, s0, 15
-; GFX6-NEXT: s_andn2_b32 s0, 15, s0
-; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, s2, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_and_b32 s0, s1, 15
-; GFX6-NEXT: s_andn2_b32 s1, 15, s1
-; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, s0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: s_lshl_b32 s0, s0, s2
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_and_b32 s0, s3, 15
+; GFX6-NEXT: s_andn2_b32 s2, 15, s3
+; GFX6-NEXT: s_lshl_b32 s1, s1, 1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: s_lshl_b32 s1, s1, s2
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1
+; GFX6-NEXT: v_or_b32_e32 v1, s1, v1
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -3881,31 +3786,21 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
; GFX8-LABEL: v_fshr_v2i16_svs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
+; GFX8-NEXT: s_lshr_b32 s3, s1, 16
+; GFX8-NEXT: s_and_b32 s4, s1, 15
+; GFX8-NEXT: s_andn2_b32 s1, 15, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, 15, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, 15
+; GFX8-NEXT: s_lshl_b32 s0, s0, s1
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, s4, v0
; GFX8-NEXT: v_or_b32_e32 v1, s0, v1
-; GFX8-NEXT: s_lshl_b32 s0, s2, 1
-; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v2, s0, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, 1
-; GFX8-NEXT: s_xor_b32 s0, s1, -1
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: s_lshr_b32 s1, s0, 16
-; GFX8-NEXT: s_and_b32 s2, s0, 15
-; GFX8-NEXT: s_andn2_b32 s0, 15, s0
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, s0, v3
-; GFX8-NEXT: s_and_b32 s0, s1, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s1
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v2, s0, v2
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, s2, v1
+; GFX8-NEXT: s_and_b32 s0, s3, 15
+; GFX8-NEXT: s_andn2_b32 s1, 15, s3
+; GFX8-NEXT: s_lshl_b32 s2, s2, 1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_lshl_b32 s1, s2, s1
+; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, s1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: ; return to shader part epilog
@@ -3970,32 +3865,19 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
; GFX6-LABEL: v_fshr_v2i16_vss:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX6-NEXT: s_or_b32 s2, s3, s2
-; GFX6-NEXT: s_bfe_u32 s3, s0, 0xf0001
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_lshr_b32 s3, s3, 14
-; GFX6-NEXT: v_or_b32_e32 v0, s3, v0
-; GFX6-NEXT: s_bfe_u32 s3, s1, 0xf0001
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: s_lshr_b32 s3, s3, 14
-; GFX6-NEXT: s_lshl_b32 s0, s0, 1
-; GFX6-NEXT: s_xor_b32 s2, s2, -1
-; GFX6-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT: s_lshr_b32 s3, s2, 16
; GFX6-NEXT: s_and_b32 s4, s2, 15
; GFX6-NEXT: s_andn2_b32 s2, 15, s2
-; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX6-NEXT: s_lshr_b32 s0, s0, s2
-; GFX6-NEXT: s_lshl_b32 s1, s1, 1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s2, v0
+; GFX6-NEXT: s_lshr_b32 s0, s0, s4
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s3, 15
; GFX6-NEXT: s_andn2_b32 s2, 15, s3
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
-; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001
-; GFX6-NEXT: s_lshr_b32 s0, s0, s2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s2, v1
+; GFX6-NEXT: s_lshr_b32 s0, s1, s0
; GFX6-NEXT: v_or_b32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -4005,32 +3887,21 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
;
; GFX8-LABEL: v_fshr_v2i16_vss:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s0
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v0
-; GFX8-NEXT: s_lshr_b32 s3, s3, 15
-; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: s_lshr_b32 s3, s2, 15
-; GFX8-NEXT: s_xor_b32 s1, s1, -1
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: v_or_b32_e32 v0, s3, v0
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_and_b32 s4, s1, 15
; GFX8-NEXT: s_andn2_b32 s1, 15, s1
-; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v1
-; GFX8-NEXT: s_lshr_b32 s0, s0, s1
-; GFX8-NEXT: s_lshl_b32 s2, s2, 1
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v0
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, s1, v1
+; GFX8-NEXT: s_lshr_b32 s0, s0, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, 1
; GFX8-NEXT: v_or_b32_e32 v1, s0, v1
; GFX8-NEXT: s_and_b32 s0, s3, 15
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, s0, v0
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s2
; GFX8-NEXT: s_andn2_b32 s1, 15, s3
-; GFX8-NEXT: s_lshr_b32 s0, s0, 1
-; GFX8-NEXT: s_lshr_b32 s0, s0, s1
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, s1, v0
+; GFX8-NEXT: s_lshr_b32 s0, s2, s0
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -4091,46 +3962,26 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) {
; GFX6-LABEL: s_fshr_v3i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX6-NEXT: s_and_b32 s6, s6, 0xffff
-; GFX6-NEXT: s_lshl_b32 s7, s7, 16
-; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_and_b32 s7, s8, 0xffff
-; GFX6-NEXT: s_bfe_u32 s8, s3, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s0, s0, 1
-; GFX6-NEXT: s_lshr_b32 s8, s8, 14
-; GFX6-NEXT: s_or_b32 s0, s0, s8
-; GFX6-NEXT: s_bfe_u32 s8, s4, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s1, s1, 1
-; GFX6-NEXT: s_lshr_b32 s8, s8, 14
-; GFX6-NEXT: s_lshl_b32 s3, s3, 1
-; GFX6-NEXT: s_xor_b32 s6, s6, -1
-; GFX6-NEXT: s_or_b32 s1, s1, s8
-; GFX6-NEXT: s_lshr_b32 s8, s6, 16
; GFX6-NEXT: s_and_b32 s9, s6, 15
; GFX6-NEXT: s_andn2_b32 s6, 15, s6
-; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s0, s0, s9
-; GFX6-NEXT: s_lshr_b32 s3, s3, s6
-; GFX6-NEXT: s_lshl_b32 s4, s4, 1
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
+; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT: s_lshl_b32 s0, s0, s6
+; GFX6-NEXT: s_lshr_b32 s3, s3, s9
; GFX6-NEXT: s_or_b32 s0, s0, s3
-; GFX6-NEXT: s_and_b32 s3, s8, 15
-; GFX6-NEXT: s_andn2_b32 s6, 15, s8
-; GFX6-NEXT: s_lshl_b32 s1, s1, s3
-; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001
-; GFX6-NEXT: s_lshr_b32 s3, s3, s6
-; GFX6-NEXT: s_or_b32 s1, s1, s3
-; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001
+; GFX6-NEXT: s_and_b32 s3, s7, 15
+; GFX6-NEXT: s_andn2_b32 s6, 15, s7
+; GFX6-NEXT: s_lshl_b32 s1, s1, 1
+; GFX6-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX6-NEXT: s_lshl_b32 s1, s1, s6
+; GFX6-NEXT: s_lshr_b32 s3, s4, s3
+; GFX6-NEXT: s_andn2_b32 s4, 15, s8
; GFX6-NEXT: s_lshl_b32 s2, s2, 1
-; GFX6-NEXT: s_lshr_b32 s3, s3, 14
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_lshl_b32 s3, s5, 1
-; GFX6-NEXT: s_xor_b32 s4, s7, -1
-; GFX6-NEXT: s_and_b32 s5, s4, 15
-; GFX6-NEXT: s_andn2_b32 s4, 15, s4
-; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s2, s2, s5
-; GFX6-NEXT: s_lshr_b32 s3, s3, s4
+; GFX6-NEXT: s_or_b32 s1, s1, s3
+; GFX6-NEXT: s_and_b32 s3, s8, 15
+; GFX6-NEXT: s_lshl_b32 s2, s2, s4
+; GFX6-NEXT: s_and_b32 s4, s5, 0xffff
+; GFX6-NEXT: s_lshr_b32 s3, s4, s3
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
@@ -4141,43 +3992,26 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
;
; GFX8-LABEL: s_fshr_v3i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s8, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_lshr_b32 s8, s8, 15
-; GFX8-NEXT: s_lshl_b32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s0, s0, s8
-; GFX8-NEXT: s_lshl_b32 s6, s6, 1
-; GFX8-NEXT: s_lshr_b32 s8, s7, 15
-; GFX8-NEXT: s_xor_b32 s4, s4, -1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_or_b32 s6, s6, s8
; GFX8-NEXT: s_lshr_b32 s8, s4, 16
; GFX8-NEXT: s_and_b32 s9, s4, 15
; GFX8-NEXT: s_andn2_b32 s4, 15, s4
-; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_lshl_b32 s0, s0, s9
-; GFX8-NEXT: s_lshr_b32 s2, s2, s4
-; GFX8-NEXT: s_lshl_b32 s7, s7, 1
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s4
+; GFX8-NEXT: s_lshr_b32 s2, s2, s9
; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s2, s8, 15
-; GFX8-NEXT: s_lshl_b32 s2, s6, s2
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s7
; GFX8-NEXT: s_andn2_b32 s4, 15, s8
-; GFX8-NEXT: s_lshr_b32 s6, s6, 1
-; GFX8-NEXT: s_lshr_b32 s4, s6, s4
-; GFX8-NEXT: s_or_b32 s2, s2, s4
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s3
+; GFX8-NEXT: s_lshl_b32 s6, s6, 1
+; GFX8-NEXT: s_lshl_b32 s4, s6, s4
+; GFX8-NEXT: s_lshr_b32 s2, s7, s2
+; GFX8-NEXT: s_or_b32 s2, s4, s2
+; GFX8-NEXT: s_and_b32 s4, s5, 15
+; GFX8-NEXT: s_andn2_b32 s5, 15, s5
; GFX8-NEXT: s_lshl_b32 s1, s1, 1
-; GFX8-NEXT: s_lshr_b32 s4, s4, 15
-; GFX8-NEXT: s_lshl_b32 s3, s3, 1
-; GFX8-NEXT: s_or_b32 s1, s1, s4
-; GFX8-NEXT: s_xor_b32 s4, s5, -1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_and_b32 s5, s4, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s4
-; GFX8-NEXT: s_lshr_b32 s3, s3, 1
; GFX8-NEXT: s_lshl_b32 s1, s1, s5
; GFX8-NEXT: s_lshr_b32 s3, s3, s4
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
@@ -4332,92 +4166,58 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
; GFX6-LABEL: v_fshr_v3i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX6-NEXT: v_bfe_u32 v8, v3, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX6-NEXT: v_bfe_u32 v8, v4, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8
-; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v8
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v6
; GFX6-NEXT: v_and_b32_e32 v9, 15, v6
; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v9, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_and_b32_e32 v3, 15, v8
-; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8
+; GFX6-NEXT: v_and_b32_e32 v3, 15, v7
+; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1
-; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3
-; GFX6-NEXT: v_xor_b32_e32 v4, -1, v7
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v5
-; GFX6-NEXT: v_and_b32_e32 v5, 15, v4
-; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v3, v4
+; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v6, v1
; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 15, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v3, v4
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_v3i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v7, 15, v2
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX8-NEXT: v_mov_b32_e32 v7, 1
-; GFX8-NEXT: v_mov_b32_e32 v8, 15
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v9
-; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2
-; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_and_b32_e32 v7, 15, v4
-; GFX8-NEXT: v_xor_b32_e32 v10, -1, v4
-; GFX8-NEXT: v_and_b32_e32 v10, 15, v10
-; GFX8-NEXT: v_lshlrev_b16_e32 v6, v7, v6
-; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v9
-; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX8-NEXT: v_and_b32_sdwa v7, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_xor_b32_e32 v7, -1, v4
+; GFX8-NEXT: v_and_b32_e32 v6, 15, v4
+; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v8
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, v6, v2
+; GFX8-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX8-NEXT: v_mov_b32_e32 v7, 15
; GFX8-NEXT: v_mov_b32_e32 v8, -1
+; GFX8-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v8, 1
; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0
+; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 15, v5
+; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, 15, v3
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v3
-; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v3
-; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3
-; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, v3, v2
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -4491,64 +4291,38 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) {
; GFX6-LABEL: s_fshr_v4i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s9, s9, 16
-; GFX6-NEXT: s_and_b32 s8, s8, 0xffff
-; GFX6-NEXT: s_or_b32 s8, s9, s8
-; GFX6-NEXT: s_lshl_b32 s9, s11, 16
-; GFX6-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX6-NEXT: s_or_b32 s9, s9, s10
-; GFX6-NEXT: s_bfe_u32 s10, s4, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s0, s0, 1
-; GFX6-NEXT: s_lshr_b32 s10, s10, 14
-; GFX6-NEXT: s_or_b32 s0, s0, s10
-; GFX6-NEXT: s_bfe_u32 s10, s5, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s1, s1, 1
-; GFX6-NEXT: s_lshr_b32 s10, s10, 14
-; GFX6-NEXT: s_lshl_b32 s4, s4, 1
-; GFX6-NEXT: s_xor_b32 s8, s8, -1
-; GFX6-NEXT: s_or_b32 s1, s1, s10
-; GFX6-NEXT: s_lshr_b32 s10, s8, 16
-; GFX6-NEXT: s_and_b32 s11, s8, 15
+; GFX6-NEXT: s_and_b32 s12, s8, 15
; GFX6-NEXT: s_andn2_b32 s8, 15, s8
-; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s0, s0, s11
-; GFX6-NEXT: s_lshr_b32 s4, s4, s8
-; GFX6-NEXT: s_lshl_b32 s5, s5, 1
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
+; GFX6-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX6-NEXT: s_lshl_b32 s0, s0, s8
+; GFX6-NEXT: s_lshr_b32 s4, s4, s12
; GFX6-NEXT: s_or_b32 s0, s0, s4
-; GFX6-NEXT: s_and_b32 s4, s10, 15
-; GFX6-NEXT: s_andn2_b32 s8, 15, s10
-; GFX6-NEXT: s_lshl_b32 s1, s1, s4
-; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001
-; GFX6-NEXT: s_lshr_b32 s4, s4, s8
+; GFX6-NEXT: s_and_b32 s4, s9, 15
+; GFX6-NEXT: s_andn2_b32 s8, 15, s9
+; GFX6-NEXT: s_lshl_b32 s1, s1, 1
+; GFX6-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT: s_lshl_b32 s1, s1, s8
+; GFX6-NEXT: s_lshr_b32 s4, s5, s4
; GFX6-NEXT: s_or_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_andn2_b32 s4, 15, s10
+; GFX6-NEXT: s_lshl_b32 s2, s2, 1
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s2, 1
-; GFX6-NEXT: s_bfe_u32 s2, s6, 0xf0001
-; GFX6-NEXT: s_lshr_b32 s2, s2, 14
-; GFX6-NEXT: s_or_b32 s1, s1, s2
-; GFX6-NEXT: s_lshl_b32 s2, s3, 1
-; GFX6-NEXT: s_bfe_u32 s3, s7, 0xf0001
-; GFX6-NEXT: s_lshr_b32 s3, s3, 14
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_lshl_b32 s3, s6, 1
-; GFX6-NEXT: s_xor_b32 s5, s9, -1
-; GFX6-NEXT: s_lshl_b32 s4, s7, 1
-; GFX6-NEXT: s_lshr_b32 s6, s5, 16
-; GFX6-NEXT: s_and_b32 s7, s5, 15
-; GFX6-NEXT: s_andn2_b32 s5, 15, s5
-; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001
-; GFX6-NEXT: s_lshl_b32 s1, s1, s7
-; GFX6-NEXT: s_lshr_b32 s3, s3, s5
-; GFX6-NEXT: s_or_b32 s1, s1, s3
-; GFX6-NEXT: s_and_b32 s3, s6, 15
-; GFX6-NEXT: s_andn2_b32 s5, 15, s6
-; GFX6-NEXT: s_lshl_b32 s2, s2, s3
-; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001
-; GFX6-NEXT: s_lshr_b32 s3, s3, s5
-; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_and_b32 s1, s10, 15
+; GFX6-NEXT: s_lshl_b32 s2, s2, s4
+; GFX6-NEXT: s_and_b32 s4, s6, 0xffff
+; GFX6-NEXT: s_lshr_b32 s1, s4, s1
+; GFX6-NEXT: s_andn2_b32 s4, 15, s11
+; GFX6-NEXT: s_lshl_b32 s3, s3, 1
+; GFX6-NEXT: s_or_b32 s1, s2, s1
+; GFX6-NEXT: s_and_b32 s2, s11, 15
+; GFX6-NEXT: s_lshl_b32 s3, s3, s4
+; GFX6-NEXT: s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT: s_lshr_b32 s2, s4, s2
+; GFX6-NEXT: s_or_b32 s2, s3, s2
; GFX6-NEXT: s_and_b32 s2, 0xffff, s2
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
@@ -4557,63 +4331,41 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
;
; GFX8-LABEL: s_fshr_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s8, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
-; GFX8-NEXT: s_lshr_b32 s8, s8, 15
-; GFX8-NEXT: s_lshl_b32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s0, s0, s8
-; GFX8-NEXT: s_lshl_b32 s6, s6, 1
-; GFX8-NEXT: s_lshr_b32 s8, s7, 15
-; GFX8-NEXT: s_xor_b32 s4, s4, -1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_or_b32 s6, s6, s8
; GFX8-NEXT: s_lshr_b32 s8, s4, 16
; GFX8-NEXT: s_and_b32 s9, s4, 15
; GFX8-NEXT: s_andn2_b32 s4, 15, s4
-; GFX8-NEXT: s_lshr_b32 s2, s2, 1
-; GFX8-NEXT: s_lshl_b32 s0, s0, s9
-; GFX8-NEXT: s_lshr_b32 s2, s2, s4
-; GFX8-NEXT: s_lshl_b32 s7, s7, 1
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s4
+; GFX8-NEXT: s_lshr_b32 s2, s2, s9
; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s2, s8, 15
-; GFX8-NEXT: s_lshl_b32 s2, s6, s2
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s7
; GFX8-NEXT: s_andn2_b32 s4, 15, s8
-; GFX8-NEXT: s_lshr_b32 s6, s6, 1
-; GFX8-NEXT: s_lshr_b32 s4, s6, s4
-; GFX8-NEXT: s_or_b32 s2, s2, s4
+; GFX8-NEXT: s_lshl_b32 s6, s6, 1
+; GFX8-NEXT: s_lshl_b32 s4, s6, s4
+; GFX8-NEXT: s_lshr_b32 s2, s7, s2
+; GFX8-NEXT: s_or_b32 s2, s4, s2
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s3
; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_lshr_b32 s2, s1, 16
; GFX8-NEXT: s_lshr_b32 s4, s3, 16
-; GFX8-NEXT: s_lshl_b32 s1, s1, 1
-; GFX8-NEXT: s_lshr_b32 s6, s6, 15
-; GFX8-NEXT: s_lshl_b32 s3, s3, 1
-; GFX8-NEXT: s_or_b32 s1, s1, s6
-; GFX8-NEXT: s_lshl_b32 s2, s2, 1
-; GFX8-NEXT: s_lshr_b32 s6, s4, 15
-; GFX8-NEXT: s_xor_b32 s5, s5, -1
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_or_b32 s2, s2, s6
; GFX8-NEXT: s_lshr_b32 s6, s5, 16
; GFX8-NEXT: s_and_b32 s7, s5, 15
; GFX8-NEXT: s_andn2_b32 s5, 15, s5
-; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_lshl_b32 s1, s1, s7
-; GFX8-NEXT: s_lshr_b32 s3, s3, s5
-; GFX8-NEXT: s_lshl_b32 s4, s4, 1
+; GFX8-NEXT: s_lshl_b32 s1, s1, 1
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshl_b32 s1, s1, s5
+; GFX8-NEXT: s_lshr_b32 s3, s3, s7
; GFX8-NEXT: s_or_b32 s1, s1, s3
; GFX8-NEXT: s_and_b32 s3, s6, 15
-; GFX8-NEXT: s_lshl_b32 s2, s2, s3
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
; GFX8-NEXT: s_andn2_b32 s5, 15, s6
-; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_lshr_b32 s3, s3, s5
+; GFX8-NEXT: s_lshl_b32 s2, s2, 1
+; GFX8-NEXT: s_lshl_b32 s2, s2, s5
+; GFX8-NEXT: s_lshr_b32 s3, s4, s3
; GFX8-NEXT: s_or_b32 s2, s2, s3
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
@@ -4749,120 +4501,76 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
; GFX6-LABEL: v_fshr_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX6-NEXT: v_or_b32_e32 v8, v9, v8
-; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v11
-; GFX6-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX6-NEXT: v_or_b32_e32 v9, v9, v10
-; GFX6-NEXT: v_bfe_u32 v10, v4, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v10
-; GFX6-NEXT: v_bfe_u32 v10, v5, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10
-; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v10
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4
-; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8
-; GFX6-NEXT: v_and_b32_e32 v11, 15, v8
+; GFX6-NEXT: v_and_b32_e32 v12, 15, v8
; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8
; GFX6-NEXT: v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, v11, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v8, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v12, v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 15, v10
-; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10
+; GFX6-NEXT: v_and_b32_e32 v4, 15, v9
+; GFX6-NEXT: v_xor_b32_e32 v8, -1, v9
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT: v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1
-; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v8, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 15, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 15, v10
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11
+; GFX6-NEXT: v_and_b32_e32 v5, 15, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v6
-; GFX6-NEXT: v_xor_b32_e32 v6, -1, v9
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v7
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX6-NEXT: v_and_b32_e32 v8, 15, v6
-; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
-; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4
; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 15, v7
-; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7
-; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3
-; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 15, v11
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v7
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v5
; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v7, 15, v2
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX8-NEXT: v_mov_b32_e32 v7, 1
-; GFX8-NEXT: v_mov_b32_e32 v8, 15
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v9
-; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2
-; GFX8-NEXT: v_xor_b32_e32 v11, -1, v4
-; GFX8-NEXT: v_and_b32_e32 v10, 15, v4
-; GFX8-NEXT: v_and_b32_e32 v11, 15, v11
-; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v9
-; GFX8-NEXT: v_lshlrev_b16_e32 v6, v10, v6
-; GFX8-NEXT: v_lshrrev_b16_e32 v9, v11, v9
-; GFX8-NEXT: v_mov_b32_e32 v10, -1
-; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX8-NEXT: v_and_b32_sdwa v9, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_xor_b32_e32 v7, -1, v4
+; GFX8-NEXT: v_and_b32_e32 v6, 15, v4
+; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v8
+; GFX8-NEXT: v_lshrrev_b16_e32 v6, v6, v2
+; GFX8-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX8-NEXT: v_mov_b32_e32 v7, 15
+; GFX8-NEXT: v_mov_b32_e32 v9, -1
+; GFX8-NEXT: v_and_b32_sdwa v8, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v10, 1
; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v9, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0
+; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v3
-; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_xor_b32_e32 v7, -1, v5
+; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5
; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v6, 15, v5
-; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v4
-; GFX8-NEXT: v_lshlrev_b16_e32 v2, v6, v2
-; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT: v_and_b32_sdwa v4, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_xor_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v2, 15, v5
+; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v6
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_and_b32_sdwa v4, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_xor_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3
+; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1
+; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -5052,8 +4760,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; GFX6-NEXT: v_not_b32_e32 v5, v4
-; GFX6-NEXT: v_and_b32_e32 v5, 63, v5
+; GFX6-NEXT: v_bfi_b32 v5, v4, 0, 63
; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
@@ -5065,8 +4772,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX8-NEXT: v_not_b32_e32 v5, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 63, v5
+; GFX8-NEXT: v_bfi_b32 v5, v4, 0, 63
; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
@@ -5078,8 +4784,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT: v_not_b32_e32 v5, v4
-; GFX9-NEXT: v_and_b32_e32 v5, 63, v5
+; GFX9-NEXT: v_bfi_b32 v5, v4, 0, 63
; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
@@ -5090,12 +4795,11 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX10-LABEL: v_fshr_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v5, v4
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 63, v5
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX10-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX10-NEXT: v_bfi_b32 v4, v4, 0, 63
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -5103,16 +4807,14 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX11-LABEL: v_fshr_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v5, v4
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v5, 63, v5
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX11-NEXT: v_and_b32_e32 v5, 63, v4
+; GFX11-NEXT: v_bfi_b32 v4, v4, 0, 63
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
@@ -5228,9 +4930,8 @@ define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) {
define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) {
; GFX6-LABEL: v_fshr_i64_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_not_b32_e32 v1, v0
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX6-NEXT: v_and_b32_e32 v1, 63, v1
+; GFX6-NEXT: v_bfi_b32 v1, v0, 0, 63
; GFX6-NEXT: v_and_b32_e32 v0, 63, v0
; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1
; GFX6-NEXT: v_lshr_b64 v[3:4], s[2:3], v0
@@ -5240,9 +4941,8 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
;
; GFX8-LABEL: v_fshr_i64_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_not_b32_e32 v1, v0
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX8-NEXT: v_and_b32_e32 v1, 63, v1
+; GFX8-NEXT: v_bfi_b32 v1, v0, 0, 63
; GFX8-NEXT: v_and_b32_e32 v0, 63, v0
; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3]
@@ -5252,9 +4952,8 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
;
; GFX9-LABEL: v_fshr_i64_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_not_b32_e32 v1, v0
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX9-NEXT: v_and_b32_e32 v1, 63, v1
+; GFX9-NEXT: v_bfi_b32 v1, v0, 0, 63
; GFX9-NEXT: v_and_b32_e32 v0, 63, v0
; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3]
@@ -5264,29 +4963,27 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
;
; GFX10-LABEL: v_fshr_i64_ssv:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_not_b32_e32 v1, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX10-NEXT: v_bfi_b32 v1, v0, 0, 63
+; GFX10-NEXT: v_and_b32_e32 v2, 63, v0
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: v_and_b32_e32 v2, 63, v1
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[2:3]
-; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1]
-; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshr_i64_ssv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_not_b32_e32 v1, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX11-NEXT: v_bfi_b32 v1, v0, 0, 63
+; GFX11-NEXT: v_and_b32_e32 v2, 63, v0
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v2, 63, v1
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1]
-; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
%cast = bitcast i64 %result to <2 x float>
@@ -5492,15 +5189,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; GFX6-NEXT: v_not_b32_e32 v9, v8
-; GFX6-NEXT: v_and_b32_e32 v9, 63, v9
+; GFX6-NEXT: v_bfi_b32 v9, v8, 0, 63
; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9
; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX6-NEXT: v_not_b32_e32 v4, v10
-; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
; GFX6-NEXT: v_and_b32_e32 v4, 63, v10
; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4
@@ -5513,15 +5208,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX8-NEXT: v_not_b32_e32 v9, v8
-; GFX8-NEXT: v_and_b32_e32 v9, 63, v9
+; GFX8-NEXT: v_bfi_b32 v9, v8, 0, 63
; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_not_b32_e32 v4, v10
-; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX8-NEXT: v_and_b32_e32 v4, 63, v10
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
@@ -5534,15 +5227,13 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT: v_not_b32_e32 v9, v8
-; GFX9-NEXT: v_and_b32_e32 v9, 63, v9
+; GFX9-NEXT: v_bfi_b32 v9, v8, 0, 63
; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX9-NEXT: v_not_b32_e32 v4, v10
-; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT: v_bfi_b32 v4, v10, 0, 63
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX9-NEXT: v_and_b32_e32 v4, 63, v10
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
@@ -5554,16 +5245,14 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX10-LABEL: v_fshr_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v9, v8
-; GFX10-NEXT: v_not_b32_e32 v11, v10
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX10-NEXT: v_bfi_b32 v9, v8, 0, 63
; GFX10-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX10-NEXT: v_and_b32_e32 v9, 63, v9
-; GFX10-NEXT: v_and_b32_e32 v11, 63, v11
+; GFX10-NEXT: v_bfi_b32 v11, v10, 0, 63
; GFX10-NEXT: v_and_b32_e32 v10, 63, v10
-; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
@@ -5575,17 +5264,15 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX11-LABEL: v_fshr_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v9, v8
-; GFX11-NEXT: v_not_b32_e32 v11, v10
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX11-NEXT: v_bfi_b32 v9, v8, 0, 63
; GFX11-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX11-NEXT: v_and_b32_e32 v9, 63, v9
-; GFX11-NEXT: v_and_b32_e32 v11, 63, v11
+; GFX11-NEXT: v_bfi_b32 v11, v10, 0, 63
; GFX11-NEXT: v_and_b32_e32 v10, 63, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
@@ -5848,8 +5535,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX6-NEXT: v_lshl_b64 v[9:10], v[0:1], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX6-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT: v_not_b32_e32 v0, v8
-; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, 0x7f
+; GFX6-NEXT: v_bfi_b32 v15, v8, 0, v0
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15
; GFX6-NEXT: v_not_b32_e32 v16, 63
; GFX6-NEXT: v_lshr_b64 v[0:1], v[9:10], v0
@@ -5897,8 +5584,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX8-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v8
-; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x7f
+; GFX8-NEXT: v_bfi_b32 v15, v8, 0, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15
; GFX8-NEXT: v_not_b32_e32 v16, 63
; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10]
@@ -5946,8 +5633,8 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX9-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v8
-; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7f
+; GFX9-NEXT: v_bfi_b32 v15, v8, 0, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10]
; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
@@ -5990,107 +5677,103 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX10-LABEL: v_fshr_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v9, v8
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v1
+; GFX10-NEXT: v_bfi_b32 v18, v8, 0, 0x7f
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 31, v1
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: v_and_b32_e32 v21, 0x7f, v8
-; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v9
-; GFX10-NEXT: v_or_b32_e32 v2, v2, v10
-; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v21
-; GFX10-NEXT: v_sub_nc_u32_e32 v12, 64, v20
-; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v20
-; GFX10-NEXT: v_lshlrev_b64 v[10:11], v20, v[2:3]
-; GFX10-NEXT: v_lshlrev_b64 v[8:9], v20, v[0:1]
-; GFX10-NEXT: v_add_nc_u32_e32 v18, 0xffffffc0, v21
-; GFX10-NEXT: v_lshrrev_b64 v[12:13], v12, v[0:1]
+; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8
+; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18
+; GFX10-NEXT: v_or_b32_e32 v2, v2, v9
+; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18
+; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19
+; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
-; GFX10-NEXT: v_lshrrev_b64 v[14:15], v21, v[4:5]
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7]
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v20
-; GFX10-NEXT: v_lshrrev_b64 v[18:19], v18, v[6:7]
-; GFX10-NEXT: v_or_b32_e32 v10, v12, v10
-; GFX10-NEXT: v_or_b32_e32 v11, v13, v11
-; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v21
-; GFX10-NEXT: v_or_b32_e32 v12, v15, v17
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v20
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v19
+; GFX10-NEXT: v_or_b32_e32 v8, v8, v10
+; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19
+; GFX10-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v18
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo
+; GFX10-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7]
; GFX10-NEXT: v_or_b32_e32 v0, v14, v16
+; GFX10-NEXT: v_or_b32_e32 v10, v15, v17
; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v21
-; GFX10-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v18, v0, s5
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v21, v[6:7]
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v19, v12, s5
-; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v20, v2, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v0, s5
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7]
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v4, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s6
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s6
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s5
-; GFX10-NEXT: v_or_b32_e32 v0, v8, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s5
+; GFX10-NEXT: v_or_b32_e32 v0, v12, v4
; GFX10-NEXT: v_or_b32_e32 v1, v7, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX10-NEXT: v_or_b32_e32 v3, v3, v9
+; GFX10-NEXT: v_or_b32_e32 v3, v3, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v9, v8
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v1
+; GFX11-NEXT: v_bfi_b32 v18, v8, 0, 0x7f
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 31, v1
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v9
-; GFX11-NEXT: v_or_b32_e32 v2, v2, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_sub_nc_u32_e32 v12, 64, v20
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v20
-; GFX11-NEXT: v_lshlrev_b64 v[10:11], v20, v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b64 v[12:13], v12, v[0:1]
-; GFX11-NEXT: v_or_b32_e32 v10, v12, v10
-; GFX11-NEXT: v_and_b32_e32 v21, 0x7f, v8
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], v20, v[0:1]
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v20
-; GFX11-NEXT: v_or_b32_e32 v11, v13, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
-; GFX11-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo
-; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 0xffffffc0, v21
-; GFX11-NEXT: v_lshrrev_b64 v[14:15], v21, v[4:5]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v21
-; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v9
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19
+; GFX11-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3]
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v19
; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7]
-; GFX11-NEXT: v_lshrrev_b64 v[18:19], v18, v[6:7]
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v20
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v21
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v18
+; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v19
+; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v18
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5]
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v0, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshrrev_b64 v[8:9], v10, v[6:7]
; GFX11-NEXT: v_or_b32_e32 v0, v14, v16
-; GFX11-NEXT: v_or_b32_e32 v12, v15, v17
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v10, v15, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v20, v2, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v0, s1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7]
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v10, s1
+; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v13, v18, v0, s1
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v21, v[6:7]
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v19, v12, s1
-; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v13, v4, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s2
; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1
-; GFX11-NEXT: v_or_b32_e32 v0, v8, v4
+; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s1
+; GFX11-NEXT: v_or_b32_e32 v0, v12, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v1, v7, v5
; GFX11-NEXT: v_or_b32_e32 v2, v2, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v3, v3, v9
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v8
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
ret i128 %result
@@ -6099,12 +5782,12 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
; GFX6-LABEL: v_fshr_i128_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_not_b32_e32 v1, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x7f
; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
; GFX6-NEXT: s_mov_b32 s1, 0
-; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX6-NEXT: v_bfi_b32 v7, v0, 0, v1
; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7
; GFX6-NEXT: v_not_b32_e32 v8, 63
@@ -6152,12 +5835,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
;
; GFX8-LABEL: v_fshr_i128_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_not_b32_e32 v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x7f
; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
; GFX8-NEXT: s_mov_b32 s1, 0
-; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX8-NEXT: v_bfi_b32 v7, v0, 0, v1
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7
; GFX8-NEXT: v_not_b32_e32 v8, 63
@@ -6205,12 +5888,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
;
; GFX9-LABEL: v_fshr_i128_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_not_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f
; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX9-NEXT: v_bfi_b32 v7, v0, 0, v1
; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7
; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
@@ -6257,101 +5940,99 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
;
; GFX10-LABEL: v_fshr_i128_ssv:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_not_b32_e32 v1, v0
-; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX10-NEXT: s_lshr_b32 s8, s1, 31
-; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0
-; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v1
-; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v11
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v13
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12
-; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v12
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9]
-; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[10:11]
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], v6, s[10:11]
-; GFX10-NEXT: v_lshrrev_b64 v[8:9], v13, s[4:5]
-; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[6:7]
-; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v13
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[10:11]
-; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT: v_or_b32_e32 v2, v3, v1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v12
-; GFX10-NEXT: v_or_b32_e32 v3, v8, v10
-; GFX10-NEXT: v_or_b32_e32 v8, v9, v11
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13
-; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s8, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v11
+; GFX10-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
+; GFX10-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
+; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v12
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
+; GFX10-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12
+; GFX10-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
+; GFX10-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
+; GFX10-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX10-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
+; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s9, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1
-; GFX10-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX10-NEXT: v_or_b32_e32 v1, v5, v1
-; GFX10-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX10-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX10-NEXT: v_or_b32_e32 v0, v7, v9
+; GFX10-NEXT: v_or_b32_e32 v7, v8, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v14, s8, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s9, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s5, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1
+; GFX10-NEXT: v_or_b32_e32 v0, v5, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v6, v3
+; GFX10-NEXT: v_or_b32_e32 v2, v7, v8
+; GFX10-NEXT: v_or_b32_e32 v3, v4, v9
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshr_i128_ssv:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_not_b32_e32 v1, v0
-; GFX11-NEXT: s_mov_b32 s9, 0
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX11-NEXT: v_bfi_b32 v11, v0, 0, 0x7f
; GFX11-NEXT: s_lshr_b32 s8, s1, 31
-; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0
-; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v1
-; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX11-NEXT: s_mov_b32 s9, 0
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v11
+; GFX11-NEXT: v_lshlrev_b64 v[5:6], v11, s[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0
; GFX11-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v13
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9]
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffc0, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[10:11]
-; GFX11-NEXT: v_lshrrev_b64 v[8:9], v13, s[4:5]
-; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[6:7]
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], v6, s[10:11]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v13
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v13
-; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-NEXT: v_or_b32_e32 v2, v3, v1
-; GFX11-NEXT: v_or_b32_e32 v3, v8, v10
-; GFX11-NEXT: v_or_b32_e32 v8, v9, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc_lo
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[10:11]
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v12
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v3, s1
+; GFX11-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[3:4], v11, s[8:9]
+; GFX11-NEXT: v_dual_cndmask_b32 v5, 0, v5 :: v_dual_add_nc_u32 v0, 0xffffffc0, v11
+; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v12
+; GFX11-NEXT: v_lshrrev_b64 v[7:8], v12, s[4:5]
+; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v12
+; GFX11-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0xffffffc0, v12
+; GFX11-NEXT: v_lshlrev_b64 v[9:10], v9, s[6:7]
+; GFX11-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v12
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v11
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v0, v3, vcc_lo
; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s1
-; GFX11-NEXT: v_dual_cndmask_b32 v4, 0, v4 :: v_dual_cndmask_b32 v5, 0, v5
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s8, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s9, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX11-NEXT: v_or_b32_e32 v1, v5, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v7, v9
+; GFX11-NEXT: v_or_b32_e32 v7, v8, v10
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v0, s1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v12, s[6:7]
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v14, s8, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s9, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s5, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s1
+; GFX11-NEXT: v_or_b32_e32 v0, v5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX11-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX11-NEXT: v_or_b32_e32 v1, v6, v3
+; GFX11-NEXT: v_or_b32_e32 v2, v7, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v9
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
%cast.result = bitcast i128 %result to <4 x float>
@@ -7486,226 +7167,224 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], 1
+; GFX6-NEXT: v_mov_b32_e32 v18, 0x7f
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
+; GFX6-NEXT: v_bfi_b32 v19, v16, 0, v18
; GFX6-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT: v_not_b32_e32 v0, v16
-; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0
-; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19
-; GFX6-NEXT: v_and_b32_e32 v25, 0x7f, v16
-; GFX6-NEXT: v_or_b32_e32 v23, v0, v21
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v25
-; GFX6-NEXT: v_or_b32_e32 v24, v1, v22
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[10:11], v0
-; GFX6-NEXT: v_lshr_b64 v[21:22], v[8:9], v25
-; GFX6-NEXT: v_not_b32_e32 v26, 63
-; GFX6-NEXT: v_or_b32_e32 v21, v21, v0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v19, v26
-; GFX6-NEXT: v_or_b32_e32 v22, v22, v1
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v0
+; GFX6-NEXT: v_not_b32_e32 v17, 63
+; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 64, v19
+; GFX6-NEXT: v_add_i32_e32 v27, vcc, v19, v17
+; GFX6-NEXT: v_lshr_b64 v[23:24], v[21:22], v23
+; GFX6-NEXT: v_lshl_b64 v[25:26], v[2:3], v19
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[21:22], v19
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[21:22], v27
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc
; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5]
-; GFX6-NEXT: v_add_i32_e64 v0, s[4:5], v25, v26
-; GFX6-NEXT: v_lshl_b64 v[16:17], v[17:18], v19
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v0
-; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
-; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v25
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
-; GFX6-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc
+; GFX6-NEXT: v_or_b32_e32 v19, v23, v25
+; GFX6-NEXT: v_or_b32_e32 v23, v24, v26
+; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v21, v19, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5]
+; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v16
+; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc
+; GFX6-NEXT: v_add_i32_e32 v16, vcc, v2, v17
+; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5]
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v2
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[8:9], v2
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[10:11], v21
; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
-; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5]
-; GFX6-NEXT: v_or_b32_e32 v0, v16, v8
-; GFX6-NEXT: v_or_b32_e32 v1, v17, v9
+; GFX6-NEXT: v_or_b32_e32 v21, v2, v21
+; GFX6-NEXT: v_or_b32_e32 v22, v3, v22
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v16
+; GFX6-NEXT: v_bfi_b32 v16, v20, 0, v18
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v21, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v22, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc
+; GFX6-NEXT: v_or_b32_e32 v0, v24, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v25, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v19, v8
+; GFX6-NEXT: v_or_b32_e32 v3, v23, v9
; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX6-NEXT: v_or_b32_e32 v6, v6, v4
-; GFX6-NEXT: v_not_b32_e32 v4, v20
-; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v4
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v10
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16
-; GFX6-NEXT: v_add_i32_e32 v17, vcc, v16, v26
-; GFX6-NEXT: v_or_b32_e32 v10, v4, v10
-; GFX6-NEXT: v_or_b32_e32 v11, v5, v11
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v16
+; GFX6-NEXT: v_add_i32_e32 v21, vcc, v16, v17
+; GFX6-NEXT: v_lshr_b64 v[10:11], v[8:9], v10
+; GFX6-NEXT: v_lshl_b64 v[18:19], v[6:7], v16
; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v16
-; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v17
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v21
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v18
-; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v18
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc
+; GFX6-NEXT: v_or_b32_e32 v11, v11, v19
+; GFX6-NEXT: v_cndmask_b32_e64 v10, v4, v6, s[4:5]
+; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v20
+; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v20
-; GFX6-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v10
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v10
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6
-; GFX6-NEXT: v_add_i32_e32 v11, vcc, v10, v26
-; GFX6-NEXT: v_or_b32_e32 v16, v4, v6
-; GFX6-NEXT: v_or_b32_e32 v19, v5, v7
-; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v11
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v10
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX6-NEXT: v_add_i32_e32 v17, vcc, v6, v17
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v6
+; GFX6-NEXT: v_cndmask_b32_e64 v11, v5, v7, s[4:5]
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v6
+; GFX6-NEXT: v_lshr_b64 v[6:7], v[12:13], v6
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[14:15], v8
+; GFX6-NEXT: v_or_b32_e32 v8, v6, v8
+; GFX6-NEXT: v_or_b32_e32 v9, v7, v9
+; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v17
+; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc
-; GFX6-NEXT: v_or_b32_e32 v4, v17, v6
+; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT: v_or_b32_e32 v4, v16, v6
; GFX6-NEXT: v_or_b32_e32 v5, v18, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v8, v10
-; GFX6-NEXT: v_or_b32_e32 v7, v9, v11
+; GFX6-NEXT: v_or_b32_e32 v6, v10, v8
+; GFX6-NEXT: v_or_b32_e32 v7, v11, v9
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v18, 0x7f
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], 1, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1
+; GFX8-NEXT: v_bfi_b32 v19, v16, 0, v18
; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v16
-; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18]
-; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
-; GFX8-NEXT: v_and_b32_e32 v25, 0x7f, v16
-; GFX8-NEXT: v_or_b32_e32 v23, v0, v21
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v25
-; GFX8-NEXT: v_or_b32_e32 v24, v1, v22
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11]
-; GFX8-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9]
-; GFX8-NEXT: v_not_b32_e32 v26, 63
-; GFX8-NEXT: v_or_b32_e32 v21, v21, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v26
-; GFX8-NEXT: v_or_b32_e32 v22, v22, v1
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18]
+; GFX8-NEXT: v_not_b32_e32 v17, 63
+; GFX8-NEXT: v_sub_u32_e32 v23, vcc, 64, v19
+; GFX8-NEXT: v_add_u32_e32 v27, vcc, v19, v17
+; GFX8-NEXT: v_lshrrev_b64 v[23:24], v23, v[21:22]
+; GFX8-NEXT: v_lshlrev_b64 v[25:26], v19, v[2:3]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v19, v[21:22]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v27, v[21:22]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5]
-; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v25, v26
-; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18]
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11]
-; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
-; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc
+; GFX8-NEXT: v_or_b32_e32 v19, v23, v25
+; GFX8-NEXT: v_or_b32_e32 v23, v24, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v21, v19, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5]
+; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v2, v17
+; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v2, v[10:11]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v21, v[10:11]
; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v1, s[4:5]
-; GFX8-NEXT: v_or_b32_e32 v0, v16, v8
-; GFX8-NEXT: v_or_b32_e32 v1, v17, v9
+; GFX8-NEXT: v_or_b32_e32 v21, v2, v21
+; GFX8-NEXT: v_or_b32_e32 v22, v3, v22
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v16, v[10:11]
+; GFX8-NEXT: v_bfi_b32 v16, v20, 0, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v21, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v22, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc
+; GFX8-NEXT: v_or_b32_e32 v0, v24, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v25, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v8
+; GFX8-NEXT: v_or_b32_e32 v3, v23, v9
; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX8-NEXT: v_or_b32_e32 v6, v6, v4
-; GFX8-NEXT: v_not_b32_e32 v4, v20
-; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v4
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v16
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v10
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7]
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, v16, v26
-; GFX8-NEXT: v_or_b32_e32 v10, v4, v10
-; GFX8-NEXT: v_or_b32_e32 v11, v5, v11
+; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v16
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, v16, v17
+; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[18:19], v16, v[6:7]
; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], v21, v[8:9]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
-; GFX8-NEXT: v_or_b32_e32 v3, v3, v18
-; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc
+; GFX8-NEXT: v_or_b32_e32 v10, v10, v18
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc
+; GFX8-NEXT: v_or_b32_e32 v11, v11, v19
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v4, v6, s[4:5]
+; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v20
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v10
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13]
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, v10, v26
-; GFX8-NEXT: v_or_b32_e32 v16, v4, v6
-; GFX8-NEXT: v_or_b32_e32 v19, v5, v7
-; GFX8-NEXT: v_lshrrev_b64 v[6:7], v11, v[14:15]
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v6, v17
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v5, v7, s[4:5]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, v[14:15]
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], v6, v[12:13]
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[14:15]
+; GFX8-NEXT: v_or_b32_e32 v8, v6, v8
+; GFX8-NEXT: v_or_b32_e32 v9, v7, v9
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], v17, v[14:15]
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc
-; GFX8-NEXT: v_or_b32_e32 v4, v17, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT: v_or_b32_e32 v4, v16, v6
; GFX8-NEXT: v_or_b32_e32 v5, v18, v7
-; GFX8-NEXT: v_or_b32_e32 v6, v8, v10
-; GFX8-NEXT: v_or_b32_e32 v7, v9, v11
+; GFX8-NEXT: v_or_b32_e32 v6, v10, v8
+; GFX8-NEXT: v_or_b32_e32 v7, v11, v9
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v19, 0x7f
; GFX9-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1
+; GFX9-NEXT: v_bfi_b32 v23, v16, 0, v19
; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v16
-; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19
+; GFX9-NEXT: v_sub_u32_e32 v0, 64, v23
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18]
-; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v25, 0x7f, v16
-; GFX9-NEXT: v_or_b32_e32 v23, v0, v21
-; GFX9-NEXT: v_sub_u32_e32 v0, 64, v25
-; GFX9-NEXT: v_or_b32_e32 v24, v1, v22
+; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3]
+; GFX9-NEXT: v_and_b32_e32 v26, 0x7f, v16
+; GFX9-NEXT: v_or_b32_e32 v24, v0, v21
+; GFX9-NEXT: v_sub_u32_e32 v0, 64, v26
+; GFX9-NEXT: v_or_b32_e32 v25, v1, v22
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11]
-; GFX9-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX9-NEXT: v_lshrrev_b64 v[21:22], v26, v[8:9]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
; GFX9-NEXT: v_or_b32_e32 v21, v21, v0
-; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v19
+; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v23
; GFX9-NEXT: v_or_b32_e32 v22, v22, v1
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v23, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v24, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v24, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v25, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5]
-; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25
-; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18]
+; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v26
+; GFX9-NEXT: v_lshlrev_b64 v[16:17], v23, v[17:18]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5]
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11]
-; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v21, v1, v22, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v26, v[10:11]
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v21, v9, vcc
; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
@@ -7713,9 +7392,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_or_b32_e32 v1, v17, v9
; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5]
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
+; GFX9-NEXT: v_bfi_b32 v16, v20, 0, v19
; GFX9-NEXT: v_or_b32_e32 v6, v6, v4
-; GFX9-NEXT: v_not_b32_e32 v4, v20
-; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v4
; GFX9-NEXT: v_sub_u32_e32 v4, 64, v16
; GFX9-NEXT: v_or_b32_e32 v2, v2, v10
; GFX9-NEXT: v_or_b32_e32 v3, v3, v11
@@ -7760,14 +7438,12 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-LABEL: v_fshr_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v17, v16
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v17
+; GFX10-NEXT: v_bfi_b32 v25, v16, 0, 0x7f
; GFX10-NEXT: v_lshrrev_b32_e32 v17, 31, v1
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26
+; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v25
; GFX10-NEXT: v_or_b32_e32 v2, v2, v17
; GFX10-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25
@@ -7776,54 +7452,54 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX10-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1]
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v25
+; GFX10-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26
; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v25
; GFX10-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v22, v18, v22
; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v26
; GFX10-NEXT: v_or_b32_e32 v21, v17, v21
; GFX10-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9]
+; GFX10-NEXT: v_bfi_b32 v25, v20, 0, 0x7f
; GFX10-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc_lo
; GFX10-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11]
; GFX10-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v26
; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v3, s4
+; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v20
; GFX10-NEXT: v_or_b32_e32 v16, v16, v18
; GFX10-NEXT: v_cndmask_b32_e64 v21, v21, v2, s4
; GFX10-NEXT: v_or_b32_e32 v17, v17, v19
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v26
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11]
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX10-NEXT: v_not_b32_e32 v16, v20
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v5
; GFX10-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s4
-; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v16
+; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v25
; GFX10-NEXT: v_or_b32_e32 v6, v6, v10
-; GFX10-NEXT: v_and_b32_e32 v20, 0x7f, v20
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s4
; GFX10-NEXT: v_cndmask_b32_e32 v26, 0, v2, vcc_lo
-; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v25
; GFX10-NEXT: v_cndmask_b32_e32 v27, 0, v3, vcc_lo
-; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25
-; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20
-; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5]
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7]
+; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v20
+; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5]
; GFX10-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
; GFX10-NEXT: v_or_b32_e32 v0, v23, v0
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25
+; GFX10-NEXT: v_or_b32_e32 v8, v2, v8
; GFX10-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v20
; GFX10-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13]
-; GFX10-NEXT: v_or_b32_e32 v8, v2, v8
; GFX10-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15]
; GFX10-NEXT: v_or_b32_e32 v2, v21, v26
; GFX10-NEXT: v_or_b32_e32 v9, v3, v9
-; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v20
; GFX10-NEXT: v_cndmask_b32_e32 v21, v4, v8, vcc_lo
; GFX10-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15]
+; GFX10-NEXT: v_cmp_gt_u32_e64 s5, 64, v20
; GFX10-NEXT: v_or_b32_e32 v8, v16, v18
; GFX10-NEXT: v_or_b32_e32 v16, v17, v19
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
@@ -7851,99 +7527,95 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX11-LABEL: v_fshr_v2i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v17, v16
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v17
+; GFX11-NEXT: v_bfi_b32 v25, v16, 0, 0x7f
; GFX11-NEXT: v_lshrrev_b32_e32 v17, 31, v1
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v25
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v2, v2, v17
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25
; GFX11-NEXT: v_lshlrev_b64 v[23:24], v25, v[0:1]
; GFX11-NEXT: v_and_b32_e32 v26, 0x7f, v16
; GFX11-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v24, 0, v24 :: v_dual_add_nc_u32 v19, 0xffffffc0, v25
-; GFX11-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25
+; GFX11-NEXT: v_dual_cndmask_b32 v23, 0, v23 :: v_dual_cndmask_b32 v24, 0, v24
+; GFX11-NEXT: v_bfi_b32 v25, v20, 0, 0x7f
; GFX11-NEXT: v_or_b32_e32 v22, v18, v22
; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v26
; GFX11-NEXT: v_or_b32_e32 v21, v17, v21
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1]
; GFX11-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v20, 0x7f, v20
; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_cndmask_b32 v21, v0, v21 :: v_dual_cndmask_b32 v22, v1, v22
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v26
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v16, v16, v18
; GFX11-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25
-; GFX11-NEXT: v_or_b32_e32 v17, v17, v19
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v21, v21, v2, s0
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v19
; GFX11-NEXT: v_cndmask_b32_e64 v22, v22, v3, s0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v26
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11]
; GFX11-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11]
; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v5
; GFX11-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
-; GFX11-NEXT: v_not_b32_e32 v16, v20
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v26
+; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20
+; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v16 :: v_dual_cndmask_b32 v1, v1, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v26, 0, v2, vcc_lo
; GFX11-NEXT: v_or_b32_e32 v6, v6, v10
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_and_b32 v20, 0x7f, v20
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_cndmask_b32 v26, 0, v2 :: v_dual_and_b32 v25, 0x7f, v16
; GFX11-NEXT: v_cndmask_b32_e32 v27, 0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v8, s0
+; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v25
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5]
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25
-; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v20
+; GFX11-NEXT: v_lshlrev_b64 v[10:11], v25, v[4:5]
; GFX11-NEXT: v_or_b32_e32 v0, v23, v0
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5]
+; GFX11-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7]
; GFX11-NEXT: v_add_nc_u32_e32 v23, 0xffffffc0, v20
-; GFX11-NEXT: v_cmp_gt_u32_e64 s1, 64, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc_lo
-; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25
; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[14:15]
+; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v25
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v20
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v8, v[4:5]
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], v25, v[6:7]
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
-; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13]
-; GFX11-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v1, v24, v1
; GFX11-NEXT: v_or_b32_e32 v8, v2, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v25
; GFX11-NEXT: v_or_b32_e32 v2, v21, v26
; GFX11-NEXT: v_or_b32_e32 v9, v3, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v1, v24, v1
+; GFX11-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
+; GFX11-NEXT: v_lshrrev_b64 v[16:17], v20, v[12:13]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v21, v4, v8, vcc_lo
; GFX11-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15]
; GFX11-NEXT: v_or_b32_e32 v8, v16, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v16, v17, v19
; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v6, v21, v6, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v8, s1
; GFX11-NEXT: v_lshrrev_b64 v[8:9], v20, v[14:15]
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1
; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, v12, s2
; GFX11-NEXT: v_or_b32_e32 v3, v22, v27
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v12, v4, v13, s2
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v8, s1
; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v9, s1
; GFX11-NEXT: v_or_b32_e32 v4, v10, v5
-; GFX11-NEXT: v_or_b32_e32 v5, v11, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v5, v11, v12
; GFX11-NEXT: v_or_b32_e32 v6, v6, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v7, v7, v9
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.illegal.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.illegal.ll
index 3390ad2cf2a0..ab71f1f44b2c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.illegal.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-value.illegal.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -global-isel -global-isel-abort=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: not llc -global-isel -global-isel-abort=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
; FIXME: Should produce context error for each one
; ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(p5) = G_GLOBAL_VALUE @external_private (in function: fn_external_private)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index cae833b0d64e..0e1bbbd1ea92 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -123,9 +123,8 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
; GFX8-NEXT: s_lshl_b32 s1, s1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, s1, v2
@@ -143,11 +142,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
; GFX7-NEXT: s_lshl_b32 s1, s1, s0
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, s1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -302,9 +300,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -319,9 +316,8 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_and_b32 s1, s4, 0xffff
; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -393,9 +389,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -410,9 +405,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -482,12 +476,11 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
-; GFX8-NEXT: v_not_b32_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_bfi_b32 v2, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -505,11 +498,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -576,10 +568,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -597,11 +588,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -668,10 +658,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
+; GFX8-NEXT: v_bfi_b32 v3, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -689,11 +678,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -820,19 +808,18 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-LABEL: insertelement_v_v4i16_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_and_b32 s1, s3, 1
; GFX8-NEXT: s_lshr_b32 s0, s3, 1
+; GFX8-NEXT: s_and_b32 s1, s3, 1
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_lshl_b32 s1, s1, 4
+; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX8-NEXT: s_lshl_b32 s2, s2, s1
; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, s1, v4
+; GFX8-NEXT: v_bfi_b32 v4, s1, 0, v4
; GFX8-NEXT: v_or_b32_e32 v4, s2, v4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
@@ -846,19 +833,18 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s3, 1
; GFX7-NEXT: s_lshr_b32 s0, s3, 1
+; GFX7-NEXT: s_and_b32 s1, s3, 1
; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 4
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: s_lshl_b32 s2, s2, s1
; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, s1, v2
+; GFX7-NEXT: v_bfi_b32 v2, s1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v2, s2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1090,8 +1076,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v4, v0, v3
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -1117,8 +1102,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -1228,8 +1212,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -1246,17 +1229,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v1
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s0
; GFX7-NEXT: v_mov_b32_e32 v4, s1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -1356,16 +1338,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT: v_not_b32_e32 v2, v2
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v2, v7
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1382,16 +1363,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
; GFX7-NEXT: s_and_b32 s0, s2, 0xffff
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT: v_not_b32_e32 v2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1479,15 +1459,14 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: s_lshr_b32 s0, s2, 1
; GFX8-NEXT: s_lshl_b32 s1, s1, 4
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, s1, v5
+; GFX8-NEXT: v_bfi_b32 v5, s1, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1501,19 +1480,18 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s2, 1
; GFX7-NEXT: s_lshr_b32 s0, s2, 1
+; GFX7-NEXT: s_and_b32 s1, s2, 1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_lshl_b32 s1, s1, 4
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2
; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, s1, v3
+; GFX7-NEXT: v_bfi_b32 v3, s1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1601,16 +1579,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX8-NEXT: v_not_b32_e32 v3, v3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, v6, v3
+; GFX8-NEXT: v_bfi_b32 v3, v3, 0, v6
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1627,16 +1604,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_bfi_b32 v3, v3, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -1910,14 +1886,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-LABEL: insertelement_v_v8i16_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s3, 1
; GFX8-NEXT: s_lshr_b32 s4, s3, 1
+; GFX8-NEXT: s_and_b32 s0, s3, 1
; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_lshl_b32 s5, s1, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s6, s0
+; GFX8-NEXT: s_lshl_b32 s5, s1, s0
+; GFX8-NEXT: s_lshl_b32 s6, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
@@ -1926,7 +1901,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v6, s6, v6
+; GFX8-NEXT: v_bfi_b32 v6, s6, 0, v6
; GFX8-NEXT: v_or_b32_e32 v6, s5, v6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
@@ -1942,14 +1917,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s3, 1
; GFX7-NEXT: s_lshr_b32 s4, s3, 1
+; GFX7-NEXT: s_and_b32 s0, s3, 1
; GFX7-NEXT: s_and_b32 s1, s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: s_lshl_b32 s5, s1, s0
-; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s6, s0
+; GFX7-NEXT: s_lshl_b32 s5, s1, s0
+; GFX7-NEXT: s_lshl_b32 s6, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -1958,7 +1932,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT: v_bfi_b32 v4, s6, 0, v4
; GFX7-NEXT: v_or_b32_e32 v4, s5, v4
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
@@ -2263,17 +2237,16 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v6, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
@@ -2294,23 +2267,22 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
; GFX7-NEXT: v_mov_b32_e32 v5, s11
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v5, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s9
@@ -2441,23 +2413,22 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX8-NEXT: v_or_b32_e32 v6, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2478,23 +2449,22 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v5, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
@@ -2628,7 +2598,6 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v7, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v8, 0
@@ -2636,7 +2605,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2
; GFX8-NEXT: v_or_b32_e32 v9, v0, v9
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
@@ -2658,9 +2627,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
+; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -2668,7 +2636,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v7, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v7
; GFX7-NEXT: v_or_b32_e32 v7, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
@@ -2773,13 +2741,12 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-LABEL: insertelement_v_v8i16_v_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s2, 1
; GFX8-NEXT: s_lshr_b32 s4, s2, 1
+; GFX8-NEXT: s_and_b32 s0, s2, 1
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s5, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_lshl_b32 s5, 0xffff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
@@ -2789,7 +2756,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX8-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
@@ -2805,14 +2772,13 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s2, 1
; GFX7-NEXT: s_lshr_b32 s4, s2, 1
+; GFX7-NEXT: s_and_b32 s0, s2, 1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
-; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s5, s0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_lshl_b32 s5, 0xffff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -2821,7 +2787,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX7-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX7-NEXT: v_or_b32_e32 v7, v1, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
@@ -2935,7 +2901,6 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v9, 0
@@ -2943,7 +2908,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3
; GFX8-NEXT: v_or_b32_e32 v3, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
@@ -2959,15 +2924,14 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v3
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -2975,7 +2939,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
@@ -3283,19 +3247,18 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX8-NEXT: s_and_b32 s0, s3, 1
+; GFX8-NEXT: s_lshr_b32 m0, s3, 1
; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_lshr_b32 m0, s3, 1
; GFX8-NEXT: s_lshl_b32 s1, s1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_mov_b32_e32 v9, 0
; GFX8-NEXT: v_mov_b32_e32 v10, 16
; GFX8-NEXT: v_mov_b32_e32 v11, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_movrels_b32_e32 v12, v0
-; GFX8-NEXT: v_and_b32_e32 v12, s0, v12
+; GFX8-NEXT: v_bfi_b32 v12, s0, 0, v12
; GFX8-NEXT: v_or_b32_e32 v12, s1, v12
; GFX8-NEXT: v_movreld_b32_e32 v0, v12
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -3310,17 +3273,16 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(ptr addrspace(1) %ptr, i16 inr
; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: s_and_b32 s0, s3, 1
+; GFX7-NEXT: s_lshr_b32 m0, s3, 1
; GFX7-NEXT: s_and_b32 s1, s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: s_lshr_b32 m0, s3, 1
; GFX7-NEXT: s_lshl_b32 s1, s1, s0
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_movrels_b32_e32 v0, v2
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, s1, v0
; GFX7-NEXT: v_movreld_b32_e32 v2, v0
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0
@@ -3644,21 +3606,20 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
; GFX8-NEXT: v_mov_b32_e32 v6, s21
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_mov_b32_e32 v7, s22
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_mov_b32_e32 v9, s23
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v9, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v1, s17
@@ -3705,20 +3666,19 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
; GFX7-NEXT: v_mov_b32_e32 v6, s21
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mov_b32_e32 v7, s22
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mov_b32_e32 v9, s23
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v9, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, s16
; GFX7-NEXT: v_mov_b32_e32 v1, s17
@@ -3936,20 +3896,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
; GFX8-NEXT: v_mov_b32_e32 v7, s17
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v9, s18
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v10, s19
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
@@ -3996,20 +3955,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
; GFX7-NEXT: v_mov_b32_e32 v7, s17
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mov_b32_e32 v9, s18
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mov_b32_e32 v10, s19
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v9, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s12
; GFX7-NEXT: v_mov_b32_e32 v1, s13
@@ -4216,7 +4174,6 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v11, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v12, 0
@@ -4231,7 +4188,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
-; GFX8-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2
; GFX8-NEXT: v_or_b32_e32 v15, v0, v15
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v15, s[12:13]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc
@@ -4263,9 +4220,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
+; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
; GFX7-NEXT: s_mov_b64 s[16:17], 0
; GFX7-NEXT: s_mov_b32 s18, -1
@@ -4278,7 +4234,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7]
; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9]
; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11]
-; GFX7-NEXT: v_and_b32_e32 v1, v11, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v11
; GFX7-NEXT: v_or_b32_e32 v11, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
@@ -4452,14 +4408,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va
; GFX8-NEXT: v_mov_b32_e32 v13, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v11, 16
; GFX8-NEXT: v_mov_b32_e32 v12, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_movrels_b32_e32 v13, v3
-; GFX8-NEXT: v_and_b32_e32 v13, s0, v13
+; GFX8-NEXT: v_bfi_b32 v13, s0, 0, v13
; GFX8-NEXT: v_or_b32_e32 v2, v13, v2
; GFX8-NEXT: v_movreld_b32_e32 v3, v2
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
@@ -4474,17 +4429,16 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: s_and_b32 s0, s2, 1
+; GFX7-NEXT: s_lshr_b32 m0, s2, 1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: s_lshr_b32 m0, s2, 1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_movrels_b32_e32 v1, v3
-; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT: v_bfi_b32 v1, s0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_movreld_b32_e32 v3, v0
; GFX7-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
@@ -4611,7 +4565,6 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v12, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v13, 0
@@ -4626,7 +4579,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11]
-; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3
; GFX8-NEXT: v_or_b32_e32 v16, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v16, s[12:13]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc
@@ -4654,13 +4607,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
; GFX7-NEXT: s_mov_b64 s[16:17], 0
; GFX7-NEXT: s_mov_b32 s18, -1
@@ -4673,7 +4625,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11]
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v12, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index fe7d421d27f8..4598bcc04a50 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -910,9 +910,8 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
; GFX8-NEXT: s_lshl_b32 s1, s1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, s1, v2
@@ -930,11 +929,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
; GFX7-NEXT: s_lshl_b32 s1, s1, s0
; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, s1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -1089,9 +1087,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v1, 0xff
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -1106,9 +1103,8 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_and_b32 s1, s4, 0xff
; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -1180,9 +1176,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v1, v2
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
+; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1197,9 +1192,8 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, s0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
@@ -1269,12 +1263,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: s_and_b32 s0, s2, 0xff
-; GFX8-NEXT: v_not_b32_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v2, v0, v1
+; GFX8-NEXT: v_bfi_b32 v2, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1292,11 +1285,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -1363,10 +1355,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX8-NEXT: s_not_b32 s0, s0
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT: v_bfi_b32 v3, s0, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -1384,11 +1375,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
-; GFX7-NEXT: s_not_b32 s0, s0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT: v_bfi_b32 v0, s0, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
@@ -1455,10 +1445,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_mov_b32_e32 v1, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
+; GFX8-NEXT: v_bfi_b32 v3, v1, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -1476,11 +1465,10 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_bfi_b32 v0, v1, 0, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -1683,19 +1671,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-LABEL: insertelement_v_v8i8_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_and_b32 s1, s3, 3
; GFX8-NEXT: s_lshr_b32 s0, s3, 2
+; GFX8-NEXT: s_and_b32 s1, s3, 3
; GFX8-NEXT: s_and_b32 s2, s2, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s1, 3
+; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX8-NEXT: s_lshl_b32 s2, s2, s1
; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, s1, v4
+; GFX8-NEXT: v_bfi_b32 v4, s1, 0, v4
; GFX8-NEXT: v_or_b32_e32 v4, s2, v4
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
@@ -1709,19 +1696,18 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s3, 3
; GFX7-NEXT: s_lshr_b32 s0, s3, 2
+; GFX7-NEXT: s_and_b32 s1, s3, 3
; GFX7-NEXT: s_and_b32 s2, s2, 0xff
; GFX7-NEXT: s_lshl_b32 s1, s1, 3
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: s_lshl_b32 s2, s2, s1
; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, s1, v2
+; GFX7-NEXT: v_bfi_b32 v2, s1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v2, s2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -1953,8 +1939,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v4
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v4, v0, v3
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -1980,8 +1965,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -2091,8 +2075,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v4
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -2109,17 +2092,16 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1
; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s0
; GFX7-NEXT: v_mov_b32_e32 v4, s1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -2219,16 +2201,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_and_b32 s0, s2, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT: v_not_b32_e32 v2, v2
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v2, v7
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2245,16 +2226,15 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
; GFX7-NEXT: s_and_b32 s0, s2, 0xff
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT: v_not_b32_e32 v2, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_bfi_b32 v2, v2, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2342,15 +2322,14 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: s_lshr_b32 s0, s2, 2
; GFX8-NEXT: s_lshl_b32 s1, s1, 3
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
+; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: s_not_b32 s1, s1
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, s1, v5
+; GFX8-NEXT: v_bfi_b32 v5, s1, 0, v5
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -2364,19 +2343,18 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_and_b32 s1, s2, 3
; GFX7-NEXT: s_lshr_b32 s0, s2, 2
+; GFX7-NEXT: s_and_b32 s1, s2, 3
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: s_lshl_b32 s1, s1, 3
+; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2
; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_not_b32 s1, s1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, s1, v3
+; GFX7-NEXT: v_bfi_b32 v3, s1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
@@ -2464,16 +2442,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
; GFX8-NEXT: v_mov_b32_e32 v6, 0xff
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX8-NEXT: v_not_b32_e32 v3, v3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, v6, v3
+; GFX8-NEXT: v_bfi_b32 v3, v3, 0, v6
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2490,16 +2467,15 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_bfi_b32 v3, v3, 0, v5
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2773,14 +2749,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-LABEL: insertelement_v_v16i8_s_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s3, 3
; GFX8-NEXT: s_lshr_b32 s4, s3, 2
+; GFX8-NEXT: s_and_b32 s0, s3, 3
; GFX8-NEXT: s_and_b32 s1, s2, 0xff
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
-; GFX8-NEXT: s_lshl_b32 s5, s1, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s6, s0
+; GFX8-NEXT: s_lshl_b32 s5, s1, s0
+; GFX8-NEXT: s_lshl_b32 s6, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_mov_b32_e32 v4, 0
@@ -2789,7 +2764,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v6, s6, v6
+; GFX8-NEXT: v_bfi_b32 v6, s6, 0, v6
; GFX8-NEXT: v_or_b32_e32 v6, s5, v6
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
@@ -2805,14 +2780,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s3, 3
; GFX7-NEXT: s_lshr_b32 s4, s3, 2
+; GFX7-NEXT: s_and_b32 s0, s3, 3
; GFX7-NEXT: s_and_b32 s1, s2, 0xff
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
-; GFX7-NEXT: s_lshl_b32 s5, s1, s0
-; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s6, s0
+; GFX7-NEXT: s_lshl_b32 s5, s1, s0
+; GFX7-NEXT: s_lshl_b32 s6, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -2821,7 +2795,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v4, s6, v4
+; GFX7-NEXT: v_bfi_b32 v4, s6, 0, v4
; GFX7-NEXT: v_or_b32_e32 v4, s5, v4
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
@@ -3126,17 +3100,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_and_b32 s4, s4, 0xff
; GFX8-NEXT: v_mov_b32_e32 v3, 0xff
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX8-NEXT: v_or_b32_e32 v6, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
@@ -3157,23 +3130,22 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT: s_and_b32 s4, s4, 0xff
; GFX7-NEXT: v_mov_b32_e32 v5, s11
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT: s_and_b32 s4, s4, 0xff
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_bfi_b32 v0, v0, 0, v1
; GFX7-NEXT: v_or_b32_e32 v5, v0, v2
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s9
@@ -3304,23 +3276,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT: v_and_b32_e32 v1, 3, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT: v_and_b32_e32 v1, 3, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, 0xff
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xff
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_not_b32_e32 v1, v1
-; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX8-NEXT: v_or_b32_e32 v6, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3341,23 +3312,22 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX7-NEXT: v_not_b32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v2
; GFX7-NEXT: v_or_b32_e32 v5, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
@@ -3491,7 +3461,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v7, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v8, 0
@@ -3499,7 +3468,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v2
; GFX8-NEXT: v_or_b32_e32 v9, v0, v9
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
@@ -3521,9 +3490,8 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
+; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -3531,7 +3499,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v7, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v7
; GFX7-NEXT: v_or_b32_e32 v7, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
@@ -3636,13 +3604,12 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-LABEL: insertelement_v_v16i8_v_s:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, s2, 3
; GFX8-NEXT: s_lshr_b32 s4, s2, 2
+; GFX8-NEXT: s_and_b32 s0, s2, 3
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX8-NEXT: s_not_b32 s5, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_lshl_b32 s5, 0xff, s0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -3652,7 +3619,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX8-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
@@ -3668,14 +3635,13 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_mov_b32 s11, 0xf000
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: s_and_b32 s0, s2, 3
; GFX7-NEXT: s_lshr_b32 s4, s2, 2
+; GFX7-NEXT: s_and_b32 s0, s2, 3
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v2
; GFX7-NEXT: s_lshl_b32 s0, s0, 3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
-; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
-; GFX7-NEXT: s_not_b32 s5, s0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_lshl_b32 s5, 0xff, s0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -3684,7 +3650,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX7-NEXT: v_bfi_b32 v1, s5, 0, v1
; GFX7-NEXT: v_or_b32_e32 v7, v1, v0
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
@@ -3798,7 +3764,6 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v8, 0
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GFX8-NEXT: v_mov_b32_e32 v9, 0
@@ -3806,7 +3771,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX8-NEXT: v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_bfi_b32 v0, v0, 0, v3
; GFX8-NEXT: v_or_b32_e32 v3, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
@@ -3822,15 +3787,14 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 2, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 3, v3
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s10, -1
@@ -3838,7 +3802,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX7-NEXT: v_bfi_b32 v1, v1, 0, v3
; GFX7-NEXT: v_or_b32_e32 v3, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 51d0b225b2a2..533b25ef1a0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -2,7 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GPRIDX %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: not --crash llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: not --crash llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
; FIXME: Need constant bus fixup pre-gfx10 for movrel
; ERR: Bad machine code: VOP* instruction violates constant bus restriction
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.class.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.class.s16.mir
index 3ca3928fbfad..45a129283dfc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.class.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.class.s16.mir
@@ -2,7 +2,7 @@
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s
# SI-ERR-NOT: remark
# SI-ERR: remark: <unknown>:0:0: cannot select: %3:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.class), %2:sgpr(s16), %1:vgpr(s32) (in function: class_s16_vcc_sv)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir
index d6b8603bc2ae..94175c5f3037 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.cos.s16.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s
# SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.cos), %1:sgpr(s16) (in function: cos_s16_vs)
# SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.cos), %1:vgpr(s16) (in function: cos_s16_vv)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir
index e2d2f1163047..5840f6255cb2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fmed3.s16.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s
# VI-ERR-NOT: remark
# VI-ERR: remark: <unknown>:0:0: cannot select: %6:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), %3:vgpr(s16), %4:vgpr(s16), %5:vgpr(s16) (in function: fmed3_s16_vvvv)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir
index 9feb4d831e07..64c4f875e971 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fract.s16.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s
# SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), %1:sgpr(s16) (in function: fract_s16_vs)
# SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), %1:vgpr(s16) (in function: fract_s16_vv)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir
index 9862d69e520c..32c018b8008e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.legacy.mir
@@ -1,8 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s
# VI-ERR: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp.legacy), %0:sgpr(s32) (in function: rcp_legacy_s32_vs)
# VI-ERR-NEXT: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp.legacy), %0:vgpr(s32) (in function: rcp_legacy_s32_vv)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir
index f9ec4364fd6f..1834177009c1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rcp.s16.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s
# SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1:sgpr(s16) (in function: rcp_s16_vs)
# SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1:vgpr(s16) (in function: rcp_s16_vv)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir
index ebe238aae019..61b40d69b250 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.clamp.mir
@@ -1,8 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s
# VI-ERR: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0:sgpr(s32) (in function: rsq_clamp_s32_vs)
# VI-ERR-NEXT: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0:vgpr(s32) (in function: rsq_clamp_s32_vv)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir
index 7fd3909405bc..b4baad9cb743 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.legacy.mir
@@ -1,8 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=VI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=VI-ERR %s
# VI-ERR: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.legacy), %0:sgpr(s32) (in function: rsq_legacy_s32_vs)
# VI-ERR-NEXT: remark: <unknown>:0:0: cannot select: %1:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.legacy), %0:vgpr(s32) (in function: rsq_legacy_s32_vv)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir
index ac1ff73ce802..fce84c451847 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.rsq.s16.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s
# SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %1:sgpr(s16) (in function: rsq_s16_vs)
# SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %1:vgpr(s16) (in function: rsq_s16_vv)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir
index 0a9792f1807c..7ab374f5853a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.sin.s16.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=SI-ERR %s
# SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), %1:sgpr(s16) (in function: sin_s16_vs)
# SI-ERR: remark: <unknown>:0:0: cannot select: %2:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), %1:vgpr(s16) (in function: sin_s16_vv)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir
index 132596d186a6..15933fad211a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-packed.xfail.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
# Make sure v2s16 SALU operations fail to select
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax-64.mir
new file mode 100644
index 000000000000..ace459979833
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax-64.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: smax_s64_sv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: smax_s64_sv
+ ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MAX_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_I64_e64_]]
+ %0:sgpr(s64) = COPY $sgpr0_sgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_SMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: smax_s64_vs
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-LABEL: name: smax_s64_vs
+ ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3
+ ; GCN-NEXT: [[V_MAX_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_I64_e64_]]
+ %0:sgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $sgpr2_sgpr3
+ %2:vgpr(s64) = G_SMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: smax_s64_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: smax_s64_vv
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MAX_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_I64_e64_]]
+ %0:vgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_SMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin-64.mir
new file mode 100644
index 000000000000..f341bdfb22ab
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin-64.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: smin_s64_sv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: smin_s64_sv
+ ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MIN_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_I64_e64_]]
+ %0:sgpr(s64) = COPY $sgpr0_sgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_SMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: smin_s64_vs
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-LABEL: name: smin_s64_vs
+ ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3
+ ; GCN-NEXT: [[V_MIN_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_I64_e64_]]
+ %0:sgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $sgpr2_sgpr3
+ %2:vgpr(s64) = G_SMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: smin_s64_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: smin_s64_vv
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MIN_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_I64_e64_]]
+ %0:vgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_SMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir
index 33f14c179f2a..2df27bdd459d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir
@@ -2,6 +2,7 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s
---
name: smin_s32_ss
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-stacksave-stackrestore.invalid.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-stacksave-stackrestore.invalid.mir
index b5f17dea5bb6..137f024f513a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-stacksave-stackrestore.invalid.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-stacksave-stackrestore.invalid.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=instruction-select -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+# RUN: not llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=instruction-select -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
# ERR: LLVM ERROR: cannot select: G_STACKRESTORE %{{[0-9]+}}:vgpr(p5) (in function: stackrestore_waveaddress_vgpr)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax-64.mir
new file mode 100644
index 000000000000..9edcf573c833
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax-64.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: umax_s64_sv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: umax_s64_sv
+ ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MAX_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_U64_e64_]]
+ %0:sgpr(s64) = COPY $sgpr0_sgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_UMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: umax_s64_vs
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-LABEL: name: umax_s64_vs
+ ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3
+ ; GCN-NEXT: [[V_MAX_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_U64_e64_]]
+ %0:sgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $sgpr2_sgpr3
+ %2:vgpr(s64) = G_UMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: umax_s64_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: umax_s64_vv
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MAX_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_U64_e64_]]
+ %0:vgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_UMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin-64.mir
new file mode 100644
index 000000000000..e6c68112d067
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin-64.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: umin_s64_sv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: umin_s64_sv
+ ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MIN_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_U64_e64_]]
+ %0:sgpr(s64) = COPY $sgpr0_sgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_UMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: umin_s64_vs
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-LABEL: name: umin_s64_vs
+ ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3
+ ; GCN-NEXT: [[V_MIN_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_U64_e64_]]
+ %0:sgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $sgpr2_sgpr3
+ %2:vgpr(s64) = G_UMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: umin_s64_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: umin_s64_vv
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MIN_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_U64_e64_]]
+ %0:vgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_UMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
index fbec70d43b4d..f9d11cb23fa4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
@@ -66,7 +66,7 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() {
define i32 @asm_vgpr_early_clobber() {
; CHECK-LABEL: name: asm_vgpr_early_clobber
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %8, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 2031627 /* regdef-ec:VGPR_32 */, def early-clobber %8, 2031627 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
@@ -94,7 +94,7 @@ entry:
define i32 @test_single_vgpr_output() nounwind {
; CHECK-LABEL: name: test_single_vgpr_output
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -106,7 +106,7 @@ entry:
define i32 @test_single_sgpr_output_s32() nounwind {
; CHECK-LABEL: name: test_single_sgpr_output_s32
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -119,7 +119,7 @@ entry:
define float @test_multiple_register_outputs_same() #0 {
; CHECK-LABEL: name: test_multiple_register_outputs_same
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 2228234 /* regdef:VGPR_32 */, def %9
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %8, 2031626 /* regdef:VGPR_32 */, def %9
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]]
@@ -136,7 +136,7 @@ define float @test_multiple_register_outputs_same() #0 {
define double @test_multiple_register_outputs_mixed() #0 {
; CHECK-LABEL: name: test_multiple_register_outputs_mixed
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 3538954 /* regdef:VReg_64 */, def %9
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %8, 3670026 /* regdef:VReg_64 */, def %9
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %9
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
@@ -171,7 +171,7 @@ define amdgpu_kernel void @test_input_vgpr_imm() {
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[C]](s32)
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 2031625 /* reguse:VGPR_32 */, [[COPY1]]
; CHECK-NEXT: S_ENDPGM 0
call void asm sideeffect "v_mov_b32 v0, $0", "v"(i32 42)
ret void
@@ -185,7 +185,7 @@ define amdgpu_kernel void @test_input_sgpr_imm() {
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[C]](s32)
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2621449 /* reguse:SReg_32 */, [[COPY1]]
; CHECK-NEXT: S_ENDPGM 0
call void asm sideeffect "s_mov_b32 s0, $0", "s"(i32 42)
ret void
@@ -212,7 +212,7 @@ define float @test_input_vgpr(i32 %src) nounwind {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
- ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 2228233 /* reguse:VGPR_32 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %9, 2031625 /* reguse:VGPR_32 */, [[COPY1]]
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -227,7 +227,7 @@ define i32 @test_memory_constraint(ptr addrspace(3) %a) nounwind {
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
- ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3)
+ ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 2031626 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -244,7 +244,7 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind {
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32)
- ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+ ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %11
; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -256,13 +256,13 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind {
define i32 @test_sgpr_matching_constraint() nounwind {
; CHECK-LABEL: name: test_sgpr_matching_constraint
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %10
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %10
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %10
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32)
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32)
- ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %12, 2359305 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3)
+ ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %12, 2621449 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %12
; CHECK-NEXT: $vgpr0 = COPY [[COPY4]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -285,7 +285,7 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
- ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %11, 2228234 /* regdef:VGPR_32 */, def %12, 2228234 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5)
+ ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def %11, 2031626 /* regdef:VGPR_32 */, def %12, 2031626 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5)
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %11
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %12
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %13
@@ -306,10 +306,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind {
define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind {
; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10
; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir
index b7e52cadd8cd..d52b5e5370a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if-invalid.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*' -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*' -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
# Make sure incorrect usage of control flow intrinsics fails to select in case some transform separated the intrinsic from its branch.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir
index 9716bb31db3f..4e8ab893c96f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.if.xfail.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -filetype=null 2>&1 | FileCheck -check-prefix=ERR %s
# Make sure there's no crash if there is somehow no successor block.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir
index 195ab02571bf..802f7f4946ee 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir
@@ -2,7 +2,7 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -O0 -run-pass=legalizer %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -O0 -run-pass=legalizer %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
# ERR: remark: <unknown>:0:0: unable to legalize instruction: %2:_(s32) = G_ATOMICRMW_FADD %0:_(p1), %1:_ :: (load store seq_cst (s32), addrspace 1) (in function: atomicrmw_fadd_global_i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-local.mir
index 0d3ee3f69ab2..d8f588ff53c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-local.mir
@@ -1,8 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
# ERR: remark: <unknown>:0:0: unable to legalize instruction: %2:_(s32) = G_ATOMICRMW_FADD %0:_(p3), %1:_ :: (load store seq_cst (s32), addrspace 3) (in function: atomicrmw_fadd_local_i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-xchg-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-xchg-flat.mir
index 22970d311a34..123580c22fbd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-xchg-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-xchg-flat.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -O0 -run-pass=legalizer -o - %s | FileCheck %s
-# RUN: not llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERROR %s
+# RUN: not llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR %s
# ERROR: LLVM ERROR: unable to legalize instruction: %2:_(s32) = G_ATOMICRMW_XCHG %0:_(p0), %1:_ :: (load store seq_cst (s32)) (in function: atomicrmw_xchg_flat_i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir
index 0a15cc3824ae..5f610924a33c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir
@@ -196,90 +196,49 @@ body: |
; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
; SI-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; SI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; SI-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+ ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; SI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
- ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; SI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
- ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]]
- ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
- ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[ZEXT]](s32)
- ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
- ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; SI-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C5]]
- ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32)
- ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
- ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[ZEXT1]](s32)
- ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
- ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC]], [[TRUNC1]]
- ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]]
- ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16)
- ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[ZEXT2]](s32)
+ ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>)
+ ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; SI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]]
+ ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+ ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]]
+ ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+ ; SI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[C3]](s32)
+ ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
+ ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[ZEXT]](s32)
; SI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32)
- ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[COPY3]](s32)
- ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
- ; SI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[LSHR4]], [[ZEXT3]](s32)
- ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
- ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC2]], [[TRUNC3]]
- ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; SI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32)
- ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[BITCAST2]], [[COPY4]](s32)
- ; SI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR6]], [[COPY5]](s32)
- ; SI-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
- ; SI-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
- ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY7]], [[C1]](s32)
- ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY6]], [[SHL4]]
+ ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
+ ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C4]]
+ ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[ZEXT1]](s32)
+ ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
+ ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC2]], [[TRUNC3]]
+ ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]]
+ ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]]
+ ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+ ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+ ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[COPY3]](s32)
+ ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
+ ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[ZEXT2]](s32)
+ ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32)
+ ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16)
+ ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[ZEXT3]](s32)
+ ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
+ ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC4]], [[TRUNC5]]
+ ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+ ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+ ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C]](s32)
+ ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL4]]
; SI-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
- ; SI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BITCAST3]]
- ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
- ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
- ; SI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32)
- ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
- ; SI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C2]]
- ; SI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C3]]
- ; SI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]]
- ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16)
- ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
- ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[ZEXT4]](s32)
- ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32)
- ; SI-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]]
- ; SI-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[COPY8]](s32)
- ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND6]](s16)
- ; SI-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[LSHR8]], [[ZEXT5]](s32)
- ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
- ; SI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC6]], [[TRUNC7]]
- ; SI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C2]]
- ; SI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C3]]
- ; SI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]]
- ; SI-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16)
- ; SI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16)
- ; SI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT1]], [[ZEXT6]](s32)
- ; SI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32)
- ; SI-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C5]]
- ; SI-NEXT: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY9]](s32)
- ; SI-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16)
- ; SI-NEXT: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[LSHR10]], [[ZEXT7]](s32)
- ; SI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32)
- ; SI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC8]], [[TRUNC9]]
- ; SI-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
- ; SI-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
- ; SI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C1]](s32)
- ; SI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL7]]
- ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
- ; SI-NEXT: $vgpr0 = COPY [[BITCAST5]](<2 x s16>)
+ ; SI-NEXT: $vgpr0 = COPY [[BITCAST3]](<2 x s16>)
;
; VI-LABEL: name: test_fshr_v2s16_v2s16
; VI: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -287,68 +246,42 @@ body: |
; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
; VI-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+ ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
; VI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
; VI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
- ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
- ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
- ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
- ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]]
- ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16)
- ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C]](s16)
- ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[LSHR2]], [[AND1]](s16)
- ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL]], [[LSHR3]]
- ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]]
- ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16)
- ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C]](s16)
- ; VI-NEXT: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[LSHR4]], [[AND3]](s16)
- ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR5]]
- ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+ ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>)
; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
- ; VI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32)
- ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32)
- ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C]](s16)
- ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C]](s16)
- ; VI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[C1]](s32)
- ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY3]], [[SHL4]]
+ ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C1]]
+ ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+ ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C2]]
+ ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+ ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+ ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16)
+ ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[SHL]], [[AND1]](s16)
+ ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[AND]](s16)
+ ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR3]]
+ ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C1]]
+ ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C2]]
+ ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+ ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16)
+ ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[SHL2]], [[AND3]](s16)
+ ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[AND2]](s16)
+ ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL3]], [[LSHR4]]
+ ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+ ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+ ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+ ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
; VI-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
- ; VI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BITCAST3]]
- ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
- ; VI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
- ; VI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32)
- ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
- ; VI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C2]]
- ; VI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C3]]
- ; VI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]]
- ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[OR]], [[AND4]](s16)
- ; VI-NEXT: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[SHL2]], [[C]](s16)
- ; VI-NEXT: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND5]](s16)
- ; VI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL5]], [[LSHR9]]
- ; VI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C2]]
- ; VI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C3]]
- ; VI-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]]
- ; VI-NEXT: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[OR1]], [[AND6]](s16)
- ; VI-NEXT: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[SHL3]], [[C]](s16)
- ; VI-NEXT: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND7]](s16)
- ; VI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL6]], [[LSHR11]]
- ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
- ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
- ; VI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
- ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL7]]
- ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
- ; VI-NEXT: $vgpr0 = COPY [[BITCAST5]](<2 x s16>)
+ ; VI-NEXT: $vgpr0 = COPY [[BITCAST3]](<2 x s16>)
;
; GFX9-LABEL: name: test_fshr_v2s16_v2s16
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -699,136 +632,75 @@ body: |
; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>)
; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
; SI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>)
- ; SI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
- ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
- ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]]
- ; SI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
- ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]]
- ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]]
- ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
- ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[ZEXT]](s32)
- ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
- ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; SI-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C5]]
- ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32)
- ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
- ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[ZEXT1]](s32)
- ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
- ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC]], [[TRUNC1]]
- ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]]
- ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]]
- ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]]
- ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16)
- ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[ZEXT2]](s32)
- ; SI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32)
- ; SI-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[COPY6]](s32)
- ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
- ; SI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[LSHR4]], [[ZEXT3]](s32)
- ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
- ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC2]], [[TRUNC3]]
- ; SI-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[BITCAST2]], [[COPY7]](s32)
- ; SI-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR1]], [[COPY8]](s32)
- ; SI-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
- ; SI-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
- ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[C]](s32)
- ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY9]], [[SHL4]]
- ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
- ; SI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY4]], [[BITCAST4]]
- ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
- ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
- ; SI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
- ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32)
- ; SI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C2]]
- ; SI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C3]]
- ; SI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]]
- ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16)
- ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
- ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[ZEXT4]](s32)
- ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32)
- ; SI-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]]
- ; SI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[COPY11]](s32)
- ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND6]](s16)
- ; SI-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[LSHR7]], [[ZEXT5]](s32)
- ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32)
- ; SI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC6]], [[TRUNC7]]
- ; SI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C2]]
- ; SI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C3]]
- ; SI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]]
- ; SI-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16)
- ; SI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16)
- ; SI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT1]], [[ZEXT6]](s32)
- ; SI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32)
- ; SI-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C5]]
- ; SI-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY12]](s32)
- ; SI-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16)
- ; SI-NEXT: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[LSHR9]], [[ZEXT7]](s32)
- ; SI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR10]](s32)
- ; SI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC8]], [[TRUNC9]]
- ; SI-NEXT: [[AND11:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]]
- ; SI-NEXT: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]]
- ; SI-NEXT: [[AND12:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]]
- ; SI-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[AND11]](s16)
- ; SI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[BITCAST1]], [[ZEXT8]](s32)
- ; SI-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32)
- ; SI-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C5]]
- ; SI-NEXT: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND13]], [[COPY13]](s32)
- ; SI-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16)
- ; SI-NEXT: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[LSHR11]], [[ZEXT9]](s32)
- ; SI-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32)
- ; SI-NEXT: [[OR5:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC10]], [[TRUNC11]]
- ; SI-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[BITCAST3]], [[COPY14]](s32)
- ; SI-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
- ; SI-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
- ; SI-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[COPY16]], [[C]](s32)
- ; SI-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[SHL9]]
- ; SI-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR6]](s32)
- ; SI-NEXT: [[XOR6:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY5]], [[BITCAST6]]
- ; SI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[XOR6]](<2 x s16>)
- ; SI-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32)
- ; SI-NEXT: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C2]]
- ; SI-NEXT: [[XOR7:%[0-9]+]]:_(s16) = G_XOR [[TRUNC12]], [[C3]]
- ; SI-NEXT: [[AND15:%[0-9]+]]:_(s16) = G_AND [[XOR7]], [[C2]]
- ; SI-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[AND14]](s16)
- ; SI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[OR5]](s16)
- ; SI-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT2]], [[ZEXT10]](s32)
- ; SI-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32)
- ; SI-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[AND16:%[0-9]+]]:_(s32) = G_AND [[SHL8]], [[C5]]
- ; SI-NEXT: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[AND16]], [[COPY17]](s32)
- ; SI-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[AND15]](s16)
- ; SI-NEXT: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[LSHR13]], [[ZEXT11]](s32)
- ; SI-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR14]](s32)
- ; SI-NEXT: [[OR7:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC13]], [[TRUNC14]]
+ ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>)
+ ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
+ ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+ ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>)
+ ; SI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
+ ; SI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]]
+ ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+ ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]]
+ ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+ ; SI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[C3]](s32)
+ ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
+ ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[ZEXT]](s32)
+ ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32)
+ ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
+ ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C4]]
+ ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[ZEXT1]](s32)
+ ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
+ ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC3]], [[TRUNC4]]
+ ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]]
+ ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]]
+ ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+ ; SI-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+ ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[COPY6]](s32)
+ ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
+ ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[ZEXT2]](s32)
+ ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32)
+ ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16)
+ ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[ZEXT3]](s32)
+ ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
+ ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC5]], [[TRUNC6]]
+ ; SI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]]
+ ; SI-NEXT: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C2]]
+ ; SI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]]
+ ; SI-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+ ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[BITCAST1]], [[COPY7]](s32)
+ ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND6]](s16)
+ ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[SHL4]], [[ZEXT4]](s32)
+ ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32)
+ ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16)
+ ; SI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C4]]
+ ; SI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT5]](s32)
+ ; SI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
+ ; SI-NEXT: [[OR2:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC7]], [[TRUNC8]]
; SI-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
; SI-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
- ; SI-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
- ; SI-NEXT: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32)
- ; SI-NEXT: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; SI-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
- ; SI-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
- ; SI-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C]](s32)
- ; SI-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL11]]
- ; SI-NEXT: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32)
- ; SI-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
- ; SI-NEXT: [[AND17:%[0-9]+]]:_(s32) = G_AND [[BITCAST8]], [[C5]]
- ; SI-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[C]](s32)
- ; SI-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL12]]
- ; SI-NEXT: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR9]](s32)
- ; SI-NEXT: [[AND18:%[0-9]+]]:_(s32) = G_AND [[BITCAST9]], [[C5]]
- ; SI-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND18]], [[C]](s32)
- ; SI-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[LSHR15]], [[SHL13]]
- ; SI-NEXT: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR10]](s32)
- ; SI-NEXT: $vgpr0 = COPY [[BITCAST10]](<2 x s16>)
- ; SI-NEXT: $vgpr1 = COPY [[BITCAST11]](<2 x s16>)
- ; SI-NEXT: $vgpr2 = COPY [[BITCAST12]](<2 x s16>)
+ ; SI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; SI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32)
+ ; SI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; SI-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+ ; SI-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+ ; SI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C]](s32)
+ ; SI-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL6]]
+ ; SI-NEXT: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32)
+ ; SI-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+ ; SI-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[BITCAST6]], [[C4]]
+ ; SI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32)
+ ; SI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL7]]
+ ; SI-NEXT: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
+ ; SI-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[BITCAST7]], [[C4]]
+ ; SI-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32)
+ ; SI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR6]], [[SHL8]]
+ ; SI-NEXT: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+ ; SI-NEXT: $vgpr0 = COPY [[BITCAST8]](<2 x s16>)
+ ; SI-NEXT: $vgpr1 = COPY [[BITCAST9]](<2 x s16>)
+ ; SI-NEXT: $vgpr2 = COPY [[BITCAST10]](<2 x s16>)
;
; VI-LABEL: name: test_fshr_v3s16_v3s16
; VI: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
@@ -852,94 +724,59 @@ body: |
; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
; VI-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>)
; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
- ; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
- ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
- ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]]
- ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
- ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]]
- ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]]
- ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16)
- ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C1]](s16)
- ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[LSHR2]], [[AND1]](s16)
- ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL]], [[LSHR3]]
- ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]]
- ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]]
- ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]]
- ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16)
- ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[C1]](s16)
- ; VI-NEXT: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[LSHR4]], [[AND3]](s16)
- ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR5]]
- ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C1]](s16)
- ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C1]](s16)
- ; VI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; VI-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[C]](s32)
- ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY6]], [[SHL4]]
- ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
- ; VI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY4]], [[BITCAST4]]
- ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
- ; VI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
- ; VI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
- ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32)
- ; VI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C2]]
- ; VI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C3]]
- ; VI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]]
- ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[OR]], [[AND4]](s16)
- ; VI-NEXT: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[SHL2]], [[C1]](s16)
- ; VI-NEXT: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[LSHR7]], [[AND5]](s16)
- ; VI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL5]], [[LSHR8]]
- ; VI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C2]]
- ; VI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C3]]
- ; VI-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]]
- ; VI-NEXT: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[OR1]], [[AND6]](s16)
- ; VI-NEXT: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[SHL3]], [[C1]](s16)
- ; VI-NEXT: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[LSHR9]], [[AND7]](s16)
- ; VI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL6]], [[LSHR10]]
- ; VI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[C1]], [[C2]]
- ; VI-NEXT: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C1]], [[C3]]
- ; VI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]]
- ; VI-NEXT: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[AND8]](s16)
- ; VI-NEXT: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC5]], [[C1]](s16)
- ; VI-NEXT: [[LSHR12:%[0-9]+]]:_(s16) = G_LSHR [[LSHR11]], [[AND9]](s16)
- ; VI-NEXT: [[OR5:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL7]], [[LSHR12]]
- ; VI-NEXT: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C1]](s16)
- ; VI-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; VI-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; VI-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[C]](s32)
- ; VI-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[COPY7]], [[SHL9]]
- ; VI-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR6]](s32)
- ; VI-NEXT: [[XOR6:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY5]], [[BITCAST6]]
- ; VI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[XOR6]](<2 x s16>)
- ; VI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32)
- ; VI-NEXT: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C2]]
- ; VI-NEXT: [[XOR7:%[0-9]+]]:_(s16) = G_XOR [[TRUNC8]], [[C3]]
- ; VI-NEXT: [[AND11:%[0-9]+]]:_(s16) = G_AND [[XOR7]], [[C2]]
- ; VI-NEXT: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[OR5]], [[AND10]](s16)
- ; VI-NEXT: [[LSHR13:%[0-9]+]]:_(s16) = G_LSHR [[SHL8]], [[C1]](s16)
- ; VI-NEXT: [[LSHR14:%[0-9]+]]:_(s16) = G_LSHR [[LSHR13]], [[AND11]](s16)
- ; VI-NEXT: [[OR7:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL10]], [[LSHR14]]
+ ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>)
+ ; VI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
+ ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+ ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>)
+ ; VI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
+ ; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C1]]
+ ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+ ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C2]]
+ ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+ ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+ ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16)
+ ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[SHL]], [[AND1]](s16)
+ ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[AND]](s16)
+ ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR3]]
+ ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C1]]
+ ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C2]]
+ ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+ ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16)
+ ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[SHL2]], [[AND3]](s16)
+ ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[AND2]](s16)
+ ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL3]], [[LSHR4]]
+ ; VI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C1]]
+ ; VI-NEXT: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC8]], [[C2]]
+ ; VI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]]
+ ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16)
+ ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[SHL4]], [[AND5]](s16)
+ ; VI-NEXT: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC5]], [[AND4]](s16)
+ ; VI-NEXT: [[OR2:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL5]], [[LSHR5]]
; VI-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
; VI-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
- ; VI-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
- ; VI-NEXT: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32)
- ; VI-NEXT: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
- ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
- ; VI-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
- ; VI-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL11]]
- ; VI-NEXT: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32)
- ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
- ; VI-NEXT: [[AND12:%[0-9]+]]:_(s32) = G_AND [[BITCAST8]], [[C4]]
- ; VI-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C]](s32)
- ; VI-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL12]]
- ; VI-NEXT: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR9]](s32)
- ; VI-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[BITCAST9]], [[C4]]
- ; VI-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C]](s32)
- ; VI-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[LSHR15]], [[SHL13]]
- ; VI-NEXT: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR10]](s32)
- ; VI-NEXT: $vgpr0 = COPY [[BITCAST10]](<2 x s16>)
- ; VI-NEXT: $vgpr1 = COPY [[BITCAST11]](<2 x s16>)
- ; VI-NEXT: $vgpr2 = COPY [[BITCAST12]](<2 x s16>)
+ ; VI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; VI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32)
+ ; VI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+ ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+ ; VI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+ ; VI-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]]
+ ; VI-NEXT: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32)
+ ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+ ; VI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; VI-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[BITCAST6]], [[C4]]
+ ; VI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32)
+ ; VI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]]
+ ; VI-NEXT: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32)
+ ; VI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[BITCAST7]], [[C4]]
+ ; VI-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32)
+ ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR6]], [[SHL8]]
+ ; VI-NEXT: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+ ; VI-NEXT: $vgpr0 = COPY [[BITCAST8]](<2 x s16>)
+ ; VI-NEXT: $vgpr1 = COPY [[BITCAST9]](<2 x s16>)
+ ; VI-NEXT: $vgpr2 = COPY [[BITCAST10]](<2 x s16>)
;
; GFX9-LABEL: name: test_fshr_v3s16_v3s16
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
@@ -1026,168 +863,87 @@ body: |
; SI-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
; SI-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
; SI-NEXT: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>)
- ; SI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; SI-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
- ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+ ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; SI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
- ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
- ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; SI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
- ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]]
- ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
- ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[ZEXT]](s32)
- ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32)
- ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; SI-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C5]]
- ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32)
- ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
- ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[ZEXT1]](s32)
- ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
- ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC]], [[TRUNC1]]
- ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]]
- ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16)
- ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[ZEXT2]](s32)
+ ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
+ ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; SI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; SI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]]
+ ; SI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+ ; SI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]]
+ ; SI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+ ; SI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[C3]](s32)
+ ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16)
+ ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[ZEXT]](s32)
; SI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32)
- ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[COPY3]](s32)
- ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
- ; SI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[LSHR4]], [[ZEXT3]](s32)
- ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
- ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC2]], [[TRUNC3]]
- ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
- ; SI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32)
- ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[BITCAST2]], [[COPY4]](s32)
- ; SI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR6]], [[COPY5]](s32)
- ; SI-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
- ; SI-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
- ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY7]], [[C1]](s32)
- ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY6]], [[SHL4]]
+ ; SI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16)
+ ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C4]]
+ ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[ZEXT1]](s32)
+ ; SI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
+ ; SI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC2]], [[TRUNC3]]
+ ; SI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]]
+ ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]]
+ ; SI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+ ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+ ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[COPY3]](s32)
+ ; SI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16)
+ ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[ZEXT2]](s32)
+ ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32)
+ ; SI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND3]](s16)
+ ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[ZEXT3]](s32)
+ ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
+ ; SI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC4]], [[TRUNC5]]
+ ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+ ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+ ; SI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C]](s32)
+ ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL4]]
; SI-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
- ; SI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BITCAST3]]
- ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
- ; SI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
- ; SI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32)
- ; SI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
- ; SI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C2]]
- ; SI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C3]]
- ; SI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]]
- ; SI-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16)
- ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
- ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[ZEXT4]](s32)
- ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32)
- ; SI-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]]
- ; SI-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[COPY8]](s32)
- ; SI-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND6]](s16)
- ; SI-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[LSHR8]], [[ZEXT5]](s32)
- ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
- ; SI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC6]], [[TRUNC7]]
- ; SI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C2]]
- ; SI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C3]]
- ; SI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]]
- ; SI-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16)
- ; SI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16)
- ; SI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT1]], [[ZEXT6]](s32)
+ ; SI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; SI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+ ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; SI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+ ; SI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
+ ; SI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32)
+ ; SI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32)
+ ; SI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+ ; SI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C1]]
+ ; SI-NEXT: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C2]]
+ ; SI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]]
+ ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+ ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[BITCAST4]], [[COPY4]](s32)
+ ; SI-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND6]](s16)
+ ; SI-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[SHL5]], [[ZEXT6]](s32)
; SI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32)
- ; SI-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C5]]
- ; SI-NEXT: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY9]](s32)
- ; SI-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16)
- ; SI-NEXT: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[LSHR10]], [[ZEXT7]](s32)
- ; SI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32)
- ; SI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC8]], [[TRUNC9]]
- ; SI-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
- ; SI-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
- ; SI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C1]](s32)
- ; SI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL7]]
- ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
- ; SI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; SI-NEXT: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C1]](s32)
- ; SI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; SI-NEXT: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C1]](s32)
- ; SI-NEXT: [[AND11:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; SI-NEXT: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; SI-NEXT: [[AND12:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]]
- ; SI-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[AND11]](s16)
- ; SI-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[BITCAST6]], [[ZEXT10]](s32)
+ ; SI-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16)
+ ; SI-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[BITCAST5]], [[C4]]
+ ; SI-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT7]](s32)
+ ; SI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32)
+ ; SI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC8]], [[TRUNC9]]
+ ; SI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C1]]
+ ; SI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C2]]
+ ; SI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C1]]
+ ; SI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+ ; SI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LSHR5]], [[COPY5]](s32)
+ ; SI-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16)
+ ; SI-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[SHL7]], [[ZEXT8]](s32)
; SI-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32)
- ; SI-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[BITCAST7]], [[C5]]
- ; SI-NEXT: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[AND13]], [[COPY10]](s32)
- ; SI-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16)
- ; SI-NEXT: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[LSHR14]], [[ZEXT11]](s32)
- ; SI-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR15]](s32)
- ; SI-NEXT: [[OR6:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC10]], [[TRUNC11]]
- ; SI-NEXT: [[AND14:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; SI-NEXT: [[XOR6:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; SI-NEXT: [[AND15:%[0-9]+]]:_(s16) = G_AND [[XOR6]], [[C2]]
- ; SI-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[AND14]](s16)
- ; SI-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[LSHR12]], [[ZEXT12]](s32)
- ; SI-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32)
- ; SI-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[LSHR16:%[0-9]+]]:_(s32) = G_LSHR [[LSHR13]], [[COPY11]](s32)
- ; SI-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[AND15]](s16)
- ; SI-NEXT: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[LSHR16]], [[ZEXT13]](s32)
- ; SI-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR17]](s32)
- ; SI-NEXT: [[OR7:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC12]], [[TRUNC13]]
- ; SI-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; SI-NEXT: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C1]](s32)
- ; SI-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[BITCAST8]], [[COPY12]](s32)
- ; SI-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LSHR18]], [[COPY13]](s32)
- ; SI-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
- ; SI-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
- ; SI-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C1]](s32)
- ; SI-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL12]]
- ; SI-NEXT: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32)
- ; SI-NEXT: [[XOR7:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BITCAST9]]
- ; SI-NEXT: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[XOR7]](<2 x s16>)
- ; SI-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST10]](s32)
- ; SI-NEXT: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C1]](s32)
- ; SI-NEXT: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32)
- ; SI-NEXT: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C2]]
- ; SI-NEXT: [[XOR8:%[0-9]+]]:_(s16) = G_XOR [[TRUNC14]], [[C3]]
- ; SI-NEXT: [[AND17:%[0-9]+]]:_(s16) = G_AND [[XOR8]], [[C2]]
- ; SI-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[AND16]](s16)
- ; SI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[OR6]](s16)
- ; SI-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT2]], [[ZEXT14]](s32)
- ; SI-NEXT: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32)
- ; SI-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[AND18:%[0-9]+]]:_(s32) = G_AND [[SHL10]], [[C5]]
- ; SI-NEXT: [[LSHR20:%[0-9]+]]:_(s32) = G_LSHR [[AND18]], [[COPY16]](s32)
- ; SI-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[AND17]](s16)
- ; SI-NEXT: [[LSHR21:%[0-9]+]]:_(s32) = G_LSHR [[LSHR20]], [[ZEXT15]](s32)
- ; SI-NEXT: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR21]](s32)
- ; SI-NEXT: [[OR9:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC16]], [[TRUNC17]]
- ; SI-NEXT: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C2]]
- ; SI-NEXT: [[XOR9:%[0-9]+]]:_(s16) = G_XOR [[TRUNC15]], [[C3]]
- ; SI-NEXT: [[AND20:%[0-9]+]]:_(s16) = G_AND [[XOR9]], [[C2]]
- ; SI-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[AND19]](s16)
- ; SI-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[OR7]](s16)
- ; SI-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT3]], [[ZEXT16]](s32)
- ; SI-NEXT: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32)
- ; SI-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[AND21:%[0-9]+]]:_(s32) = G_AND [[SHL11]], [[C5]]
- ; SI-NEXT: [[LSHR22:%[0-9]+]]:_(s32) = G_LSHR [[AND21]], [[COPY17]](s32)
- ; SI-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[AND20]](s16)
- ; SI-NEXT: [[LSHR23:%[0-9]+]]:_(s32) = G_LSHR [[LSHR22]], [[ZEXT17]](s32)
- ; SI-NEXT: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR23]](s32)
- ; SI-NEXT: [[OR10:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC18]], [[TRUNC19]]
- ; SI-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16)
- ; SI-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16)
- ; SI-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT19]], [[C1]](s32)
- ; SI-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT18]], [[SHL15]]
- ; SI-NEXT: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR11]](s32)
- ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST5]](<2 x s16>), [[BITCAST11]](<2 x s16>)
+ ; SI-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16)
+ ; SI-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[LSHR6]], [[ZEXT9]](s32)
+ ; SI-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
+ ; SI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[TRUNC10]], [[TRUNC11]]
+ ; SI-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+ ; SI-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
+ ; SI-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C]](s32)
+ ; SI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL9]]
+ ; SI-NEXT: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+ ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST3]](<2 x s16>), [[BITCAST7]](<2 x s16>)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
;
; VI-LABEL: name: test_fshr_v4s16_v4s16
@@ -1199,125 +955,73 @@ body: |
; VI-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
; VI-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
; VI-NEXT: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>)
- ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+ ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
; VI-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
; VI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
- ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
- ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
- ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
- ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]]
- ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16)
- ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C]](s16)
- ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[LSHR2]], [[AND1]](s16)
- ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL]], [[LSHR3]]
- ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]]
- ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16)
- ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C]](s16)
- ; VI-NEXT: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[LSHR4]], [[AND3]](s16)
- ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR5]]
- ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
- ; VI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32)
- ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32)
- ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C]](s16)
- ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C]](s16)
- ; VI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[C1]](s32)
- ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY3]], [[SHL4]]
+ ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; VI-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; VI-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C1]]
+ ; VI-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
+ ; VI-NEXT: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C2]]
+ ; VI-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]]
+ ; VI-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+ ; VI-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16)
+ ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[SHL]], [[AND1]](s16)
+ ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[AND]](s16)
+ ; VI-NEXT: [[OR:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL1]], [[LSHR3]]
+ ; VI-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C1]]
+ ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C2]]
+ ; VI-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]]
+ ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16)
+ ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[SHL2]], [[AND3]](s16)
+ ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[AND2]](s16)
+ ; VI-NEXT: [[OR1:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL3]], [[LSHR4]]
+ ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+ ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+ ; VI-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+ ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
; VI-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
- ; VI-NEXT: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BITCAST3]]
- ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>)
+ ; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
; VI-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
- ; VI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32)
- ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
- ; VI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C2]]
- ; VI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C3]]
- ; VI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]]
- ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[OR]], [[AND4]](s16)
- ; VI-NEXT: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[SHL2]], [[C]](s16)
- ; VI-NEXT: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND5]](s16)
- ; VI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL5]], [[LSHR9]]
- ; VI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C2]]
- ; VI-NEXT: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C3]]
- ; VI-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]]
- ; VI-NEXT: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[OR1]], [[AND6]](s16)
- ; VI-NEXT: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[SHL3]], [[C]](s16)
- ; VI-NEXT: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND7]](s16)
- ; VI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL6]], [[LSHR11]]
- ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
- ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
- ; VI-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
- ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL7]]
- ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
- ; VI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; VI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32)
- ; VI-NEXT: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C1]](s32)
- ; VI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32)
- ; VI-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; VI-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32)
- ; VI-NEXT: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C1]](s32)
- ; VI-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR13]](s32)
- ; VI-NEXT: [[AND8:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; VI-NEXT: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; VI-NEXT: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]]
- ; VI-NEXT: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[TRUNC8]], [[AND8]](s16)
- ; VI-NEXT: [[LSHR14:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC10]], [[C]](s16)
- ; VI-NEXT: [[LSHR15:%[0-9]+]]:_(s16) = G_LSHR [[LSHR14]], [[AND9]](s16)
- ; VI-NEXT: [[OR6:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL8]], [[LSHR15]]
- ; VI-NEXT: [[AND10:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]]
- ; VI-NEXT: [[XOR6:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]]
- ; VI-NEXT: [[AND11:%[0-9]+]]:_(s16) = G_AND [[XOR6]], [[C2]]
- ; VI-NEXT: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[TRUNC9]], [[AND10]](s16)
- ; VI-NEXT: [[LSHR16:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC11]], [[C]](s16)
- ; VI-NEXT: [[LSHR17:%[0-9]+]]:_(s16) = G_LSHR [[LSHR16]], [[AND11]](s16)
- ; VI-NEXT: [[OR7:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL9]], [[LSHR17]]
- ; VI-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; VI-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST8]](s32)
- ; VI-NEXT: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C1]](s32)
- ; VI-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR18]](s32)
- ; VI-NEXT: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[TRUNC12]], [[C]](s16)
- ; VI-NEXT: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[TRUNC13]], [[C]](s16)
- ; VI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; VI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; VI-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C1]](s32)
- ; VI-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[COPY4]], [[SHL12]]
- ; VI-NEXT: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32)
- ; VI-NEXT: [[XOR7:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BITCAST9]]
- ; VI-NEXT: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[XOR7]](<2 x s16>)
- ; VI-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST10]](s32)
- ; VI-NEXT: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C1]](s32)
- ; VI-NEXT: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32)
- ; VI-NEXT: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C2]]
- ; VI-NEXT: [[XOR8:%[0-9]+]]:_(s16) = G_XOR [[TRUNC14]], [[C3]]
- ; VI-NEXT: [[AND13:%[0-9]+]]:_(s16) = G_AND [[XOR8]], [[C2]]
- ; VI-NEXT: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[OR6]], [[AND12]](s16)
- ; VI-NEXT: [[LSHR20:%[0-9]+]]:_(s16) = G_LSHR [[SHL10]], [[C]](s16)
- ; VI-NEXT: [[LSHR21:%[0-9]+]]:_(s16) = G_LSHR [[LSHR20]], [[AND13]](s16)
- ; VI-NEXT: [[OR9:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL13]], [[LSHR21]]
- ; VI-NEXT: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C2]]
- ; VI-NEXT: [[XOR9:%[0-9]+]]:_(s16) = G_XOR [[TRUNC15]], [[C3]]
- ; VI-NEXT: [[AND15:%[0-9]+]]:_(s16) = G_AND [[XOR9]], [[C2]]
- ; VI-NEXT: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[OR7]], [[AND14]](s16)
- ; VI-NEXT: [[LSHR22:%[0-9]+]]:_(s16) = G_LSHR [[SHL11]], [[C]](s16)
- ; VI-NEXT: [[LSHR23:%[0-9]+]]:_(s16) = G_LSHR [[LSHR22]], [[AND15]](s16)
- ; VI-NEXT: [[OR10:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL14]], [[LSHR23]]
- ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16)
- ; VI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16)
- ; VI-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C1]](s32)
- ; VI-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL15]]
- ; VI-NEXT: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR11]](s32)
- ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST5]](<2 x s16>), [[BITCAST11]](<2 x s16>)
+ ; VI-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+ ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
+ ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; VI-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
+ ; VI-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+ ; VI-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32)
+ ; VI-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
+ ; VI-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32)
+ ; VI-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32)
+ ; VI-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+ ; VI-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C1]]
+ ; VI-NEXT: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC10]], [[C2]]
+ ; VI-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]]
+ ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[TRUNC6]], [[C3]](s16)
+ ; VI-NEXT: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[SHL5]], [[AND5]](s16)
+ ; VI-NEXT: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC8]], [[AND4]](s16)
+ ; VI-NEXT: [[OR3:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL6]], [[LSHR8]]
+ ; VI-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C1]]
+ ; VI-NEXT: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC11]], [[C2]]
+ ; VI-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C1]]
+ ; VI-NEXT: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[TRUNC7]], [[C3]](s16)
+ ; VI-NEXT: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[SHL7]], [[AND7]](s16)
+ ; VI-NEXT: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC9]], [[AND6]](s16)
+ ; VI-NEXT: [[OR4:%[0-9]+]]:_(s16) = disjoint G_OR [[SHL8]], [[LSHR9]]
+ ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+ ; VI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
+ ; VI-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32)
+ ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]]
+ ; VI-NEXT: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32)
+ ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST3]](<2 x s16>), [[BITCAST7]](<2 x s16>)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
;
; GFX9-LABEL: name: test_fshr_v4s16_v4s16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir
index b43c18ee2aa3..80737815cc16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: not llc -mtriple=amdgcn -run-pass=legalizer -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -mtriple=amdgcn -run-pass=legalizer -filetype=null %s 2>&1 | FileCheck %s
# CHECK: LLVM ERROR: unable to legalize instruction: %3:_(p0) = G_JUMP_TABLE %jump-table.0 (in function: jt_test)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
index aebda3f28d5f..cbd9c2173b7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
@@ -2,6 +2,7 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX67 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX67 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s
---
name: s_buffer_load_s32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
index db11855d2967..45714fd99d7b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
@@ -4,6 +4,7 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s
---
name: test_smax_s32
@@ -34,6 +35,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMAX]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smax_s32
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMAX]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = G_SMAX %0, %1
@@ -72,6 +81,14 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s64), [[COPY1]]
; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+ ;
+ ; GFX1250-LABEL: name: test_smax_s64
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[SMAX]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s64) = G_SMAX %0, %1
@@ -115,6 +132,17 @@ body: |
; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smax_s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s16) = G_TRUNC %0
@@ -165,6 +193,19 @@ body: |
; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smax_s8
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+ ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s8) = G_TRUNC %0
@@ -209,6 +250,16 @@ body: |
; GFX9-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMAX]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smax_s17
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 17
+ ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMAX]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s17) = G_TRUNC %0
@@ -259,6 +310,18 @@ body: |
; GFX9-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[UV3]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smax_v2s32
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<2 x s32>) = G_SMAX %0, %1
@@ -309,6 +372,19 @@ body: |
; GFX9-NEXT: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV2]], [[UV5]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32), [[SMAX2]](s32)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smax_v3s32
+ ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+ ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[UV]], [[UV3]]
+ ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[UV4]]
+ ; GFX1250-NEXT: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV2]], [[UV5]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32), [[SMAX2]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
%1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
%2:_(<3 x s32>) = G_SMAX %0, %1
@@ -375,6 +451,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMAX]](<2 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_smax_v2s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMAX]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
%2:_(<2 x s16>) = G_SMAX %0, %1
@@ -461,6 +545,26 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX1]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smax_v3s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[SMAX]](<2 x s16>)
+ ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX1]](s16)
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
+ ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s16>) = G_IMPLICIT_DEF
%1:_(<3 x s16>) = G_IMPLICIT_DEF
%2:_(<3 x s16>) = G_SMAX %0, %1
@@ -568,6 +672,18 @@ body: |
; GFX9-NEXT: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV1]], [[UV3]]
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMAX]](<2 x s16>), [[SMAX1]](<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_smax_v4s16
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMAX]](<2 x s16>), [[SMAX1]](<2 x s16>)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr2_vgpr3
%2:_(<4 x s16>) = G_SMAX %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
index d366242db087..88fe5d0d5433 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
@@ -4,6 +4,7 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s
---
name: test_smin_s32
@@ -34,6 +35,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMIN]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smin_s32
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMIN]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = G_SMIN %0, %1
@@ -72,6 +81,14 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]]
; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+ ;
+ ; GFX1250-LABEL: name: test_smin_s64
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[SMIN]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s64) = G_SMIN %0, %1
@@ -115,6 +132,17 @@ body: |
; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smin_s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s16) = G_TRUNC %0
@@ -165,6 +193,19 @@ body: |
; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smin_s8
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+ ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s8) = G_TRUNC %0
@@ -209,6 +250,16 @@ body: |
; GFX9-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMIN]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smin_s17
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 17
+ ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMIN]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s17) = G_TRUNC %0
@@ -259,6 +310,18 @@ body: |
; GFX9-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[UV3]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smin_v2s32
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<2 x s32>) = G_SMIN %0, %1
@@ -309,6 +372,19 @@ body: |
; GFX9-NEXT: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV2]], [[UV5]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32), [[SMIN2]](s32)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smin_v3s32
+ ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+ ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[UV]], [[UV3]]
+ ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[UV4]]
+ ; GFX1250-NEXT: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV2]], [[UV5]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32), [[SMIN2]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
%1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
%2:_(<3 x s32>) = G_SMIN %0, %1
@@ -375,6 +451,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMIN]](<2 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_smin_v2s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMIN]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
%2:_(<2 x s16>) = G_SMIN %0, %1
@@ -461,6 +545,26 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN1]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smin_v3s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[SMIN]](<2 x s16>)
+ ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN1]](s16)
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
+ ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s16>) = G_IMPLICIT_DEF
%1:_(<3 x s16>) = G_IMPLICIT_DEF
%2:_(<3 x s16>) = G_SMIN %0, %1
@@ -568,6 +672,18 @@ body: |
; GFX9-NEXT: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV1]], [[UV3]]
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMIN]](<2 x s16>), [[SMIN1]](<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_smin_v4s16
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMIN]](<2 x s16>), [[SMIN1]](<2 x s16>)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr2_vgpr3
%2:_(<4 x s16>) = G_SMIN %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir
index e8fa4e9d822f..32b526e28912 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir
@@ -4,6 +4,7 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s
---
name: test_umax_s32
@@ -34,6 +35,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMAX]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umax_s32
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMAX]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = G_UMAX %0, %1
@@ -72,6 +81,14 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]]
; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+ ;
+ ; GFX1250-LABEL: name: test_umax_s64
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s64) = G_UMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[UMAX]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s64) = G_UMAX %0, %1
@@ -116,6 +133,17 @@ body: |
; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umax_s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s16) = G_TRUNC %0
@@ -169,6 +197,20 @@ body: |
; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[AND]], [[AND1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umax_s8
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+ ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C]]
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[AND]], [[AND1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s8) = G_TRUNC %0
@@ -216,6 +258,17 @@ body: |
; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMAX]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umax_s17
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 131071
+ ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+ ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMAX]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s17) = G_TRUNC %0
@@ -266,6 +319,18 @@ body: |
; GFX9-NEXT: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[UV1]], [[UV3]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umax_v2s32
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<2 x s32>) = G_UMAX %0, %1
@@ -316,6 +381,19 @@ body: |
; GFX9-NEXT: [[UMAX2:%[0-9]+]]:_(s32) = G_UMAX [[UV2]], [[UV5]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32), [[UMAX2]](s32)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umax_v3s32
+ ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+ ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>)
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[UV]], [[UV3]]
+ ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[UV1]], [[UV4]]
+ ; GFX1250-NEXT: [[UMAX2:%[0-9]+]]:_(s32) = G_UMAX [[UV2]], [[UV5]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32), [[UMAX2]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
%1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
%2:_(<3 x s32>) = G_UMAX %0, %1
@@ -378,6 +456,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMAX]](<2 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_umax_v2s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMAX]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
%2:_(<2 x s16>) = G_UMAX %0, %1
@@ -463,6 +549,26 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX1]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umax_v3s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(s16) = G_UMAX [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UMAX]](<2 x s16>)
+ ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX1]](s16)
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
+ ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s16>) = G_IMPLICIT_DEF
%1:_(<3 x s16>) = G_IMPLICIT_DEF
%2:_(<3 x s16>) = G_UMAX %0, %1
@@ -562,6 +668,18 @@ body: |
; GFX9-NEXT: [[UMAX1:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV1]], [[UV3]]
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMAX]](<2 x s16>), [[UMAX1]](<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_umax_v4s16
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMAX]](<2 x s16>), [[UMAX1]](<2 x s16>)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr2_vgpr3
%2:_(<4 x s16>) = G_UMAX %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir
index 8ee0df5ce670..8666c29c99d1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir
@@ -4,6 +4,7 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s
---
name: test_umin_s32
@@ -34,6 +35,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMIN]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umin_s32
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMIN]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = G_UMIN %0, %1
@@ -72,6 +81,14 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+ ;
+ ; GFX1250-LABEL: name: test_umin_s64
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s64) = G_UMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[UMIN]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s64) = G_UMIN %0, %1
@@ -116,6 +133,17 @@ body: |
; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umin_s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s16) = G_TRUNC %0
@@ -169,6 +197,20 @@ body: |
; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[AND]], [[AND1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umin_s8
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+ ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C]]
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[AND]], [[AND1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s8) = G_TRUNC %0
@@ -216,6 +258,17 @@ body: |
; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AND]], [[AND1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMIN]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umin_s17
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 131071
+ ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+ ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AND]], [[AND1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMIN]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s17) = G_TRUNC %0
@@ -266,6 +319,18 @@ body: |
; GFX9-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umin_v2s32
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<2 x s32>) = G_UMIN %0, %1
@@ -316,6 +381,19 @@ body: |
; GFX9-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[UV2]], [[UV5]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32), [[UMIN2]](s32)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umin_v3s32
+ ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+ ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>)
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV3]]
+ ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV4]]
+ ; GFX1250-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[UV2]], [[UV5]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32), [[UMIN2]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
%1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
%2:_(<3 x s32>) = G_UMIN %0, %1
@@ -378,6 +456,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMIN]](<2 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_umin_v2s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMIN]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
%2:_(<2 x s16>) = G_UMIN %0, %1
@@ -463,6 +549,26 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN1]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umin_v3s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UMIN]](<2 x s16>)
+ ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN1]](s16)
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
+ ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s16>) = G_IMPLICIT_DEF
%1:_(<3 x s16>) = G_IMPLICIT_DEF
%2:_(<3 x s16>) = G_UMIN %0, %1
@@ -562,6 +668,18 @@ body: |
; GFX9-NEXT: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV1]], [[UV3]]
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMIN]](<2 x s16>), [[UMIN1]](<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_umin_v4s16
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMIN]](<2 x s16>), [[UMIN1]](<2 x s16>)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr2_vgpr3
%2:_(<4 x s16>) = G_UMIN %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 7916267c6eca..800df8987703 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -2,6 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX1250
declare i16 @llvm.abs.i16(i16, i1)
declare i32 @llvm.abs.i32(i32, i1)
@@ -13,11 +14,30 @@ declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1)
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
-; GFX-LABEL: abs_sgpr_i16:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_sext_i32_i16 s0, s0
-; GFX-NEXT: s_abs_i32 s0, s0
-; GFX-NEXT: ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sext_i32_i16 s0, s0
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_sext_i32_i16 s0, s0
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i16 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: ; return to shader part epilog
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}
@@ -32,14 +52,42 @@ define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
}
define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
-; GFX-LABEL: abs_sgpr_i64:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_ashr_i32 s2, s1, 31
-; GFX-NEXT: s_add_u32 s0, s0, s2
-; GFX-NEXT: s_mov_b32 s3, s2
-; GFX-NEXT: s_addc_u32 s1, s1, s2
-; GFX-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX-NEXT: ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_i64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_ashr_i32 s2, s1, 31
+; GFX6-NEXT: s_add_u32 s0, s0, s2
+; GFX6-NEXT: s_mov_b32 s3, s2
+; GFX6-NEXT: s_addc_u32 s1, s1, s2
+; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_ashr_i32 s2, s1, 31
+; GFX8-NEXT: s_add_u32 s0, s0, s2
+; GFX8-NEXT: s_mov_b32 s3, s2
+; GFX8-NEXT: s_addc_u32 s1, s1, s2
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_ashr_i32 s2, s1, 31
+; GFX10-NEXT: s_add_u32 s0, s0, s2
+; GFX10-NEXT: s_mov_b32 s3, s2
+; GFX10-NEXT: s_addc_u32 s1, s1, s2
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_ashr_i32 s2, s1, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mov_b32 s3, s2
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT: ; return to shader part epilog
%res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
ret i64 %res
}
@@ -78,6 +126,14 @@ define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
; GFX10-NEXT: v_max_i16 v0, v0, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_max_i16 v0, v0, v1
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: ; return to shader part epilog
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}
@@ -103,6 +159,14 @@ define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: ; return to shader part epilog
%res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
ret i32 %res
}
@@ -140,6 +204,20 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-NEXT: ; return to shader part epilog
%res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
ret i64 %res
}
@@ -192,6 +270,24 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_v4i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
+; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_max_i32_e32 v0, v0, v4
+; GFX1250-NEXT: v_max_i32_e32 v1, v1, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
ret <4 x i32> %res
}
@@ -243,6 +339,21 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_v2i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_sub_nc_u16 v2, 0, v0
+; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_max_i16 v0, v0, v2
+; GFX1250-NEXT: v_max_i16 v1, v1, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
ret <2 x i8> %res
}
@@ -307,6 +418,27 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_v3i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v0
+; GFX1250-NEXT: v_sub_nc_u16 v4, 0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_sub_nc_u16 v5, 0, v2
+; GFX1250-NEXT: v_max_i16 v0, v0, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_max_i16 v1, v1, v4
+; GFX1250-NEXT: v_max_i16 v2, v2, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
ret <3 x i8> %res
}
@@ -341,6 +473,16 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
; GFX10-NEXT: s_abs_i32 s0, s0
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s1, s0
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i16 s1, s0
+; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
+; GFX1250-NEXT: s_abs_i32 s1, s1
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
ret <2 x i16> %res
}
@@ -375,6 +517,14 @@ define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
ret <2 x i16> %res
}
@@ -416,6 +566,17 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0
; GFX10-NEXT: s_abs_i32 s1, s1
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v3i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i16 s2, s0
+; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
+; GFX1250-NEXT: s_abs_i32 s2, s2
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_sext_i32_i16 s1, s1
+; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX1250-NEXT: s_abs_i32 s1, s1
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
ret <3 x i16> %res
}
@@ -460,6 +621,18 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_v3i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX1250-NEXT: v_max_i16 v1, v1, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
ret <3 x i16> %res
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
index a09703285087..bd6634f25077 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -358,12 +358,12 @@ main_body:
define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
; GFX11-TRUE16-LABEL: v_interp_f16_imm_params:
; GFX11-TRUE16: ; %bb.0: ; %main_body
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
-; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
+; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v2, v3 wait_exp:7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
@@ -383,12 +383,12 @@ define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #
;
; GFX12-TRUE16-LABEL: v_interp_f16_imm_params:
; GFX12-TRUE16: ; %bb.0: ; %main_body
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
-; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
+; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v2, v3 wait_exp:7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 2b595b9bbecc..b0ca1e8ef3df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -2,7 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1013 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: not llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: not llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s -filetype=null 2>&1 | FileCheck -check-prefix=ERR %s
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 6bb104311a4d..ab8d8c192187 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-UNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-NOUNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s
@@ -64,6 +66,52 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7
; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1250-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v2, v[0:1], off
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v3, v[0:1], off offset:1
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v4, v[0:1], off offset:2
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v5, v[0:1], off offset:3
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v6, v[0:1], off offset:4
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v7, v[0:1], off offset:5
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v8, v[0:1], off offset:6
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v9, v[0:1], off offset:7
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8
+; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v3, 16, v4 :: v_dual_lshlrev_b32 v2, 24, v5
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v6, 16, v8 :: v_dual_lshlrev_b32 v5, 24, v9
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1
+; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4
+; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7
+; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -256,6 +304,34 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6
; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1250-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0x5
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v2, v[0:1], off
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v3, v[0:1], off offset:2
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v4, v[0:1], off offset:4
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v5, v[0:1], off offset:6
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v6, v[0:1], off offset:8
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v7, v[0:1], off offset:10
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6
+; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -346,16 +422,35 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v3i32_align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align4:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align4:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v3i32_align4:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align4:
; GFX9: ; %bb.0:
@@ -392,16 +487,35 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
}
define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_i96_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_i96_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_i96_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_i96_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_i96_align8:
; GFX9: ; %bb.0:
@@ -438,16 +552,35 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v3i32_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v3i32_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align8:
; GFX9: ; %bb.0:
@@ -484,16 +617,35 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
}
define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v6i16_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v6i16_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v6i16_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v6i16_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v6i16_align8:
; GFX9: ; %bb.0:
@@ -539,28 +691,67 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
}
define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v12i8_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v13, 8, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v12, 16, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; GFX12-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX12-NEXT: v_lshrrev_b32_e32 v11, 24, v2
-; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
-; GFX12-NEXT: v_mov_b32_e32 v8, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v12
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v12i8_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GFX12-UNALIGNED-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
+; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v8, v2
+; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v2, v12
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v12i8_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GFX12-NOUNALIGNED-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
+; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, v2
+; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, v12
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v12i8_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v13, 8, v0 :: v_dual_lshrrev_b32 v12, 16, v0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v3, 24, v0 :: v_dual_lshrrev_b32 v5, 8, v1
+; GFX1250-NEXT: v_dual_lshrrev_b32 v6, 16, v1 :: v_dual_lshrrev_b32 v7, 24, v1
+; GFX1250-NEXT: v_dual_lshrrev_b32 v9, 8, v2 :: v_dual_lshrrev_b32 v10, 16, v2
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_lshrrev_b32 v11, 24, v2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v1, v13
+; GFX1250-NEXT: v_mov_b32_e32 v2, v12
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v12i8_align8:
; GFX9: ; %bb.0:
@@ -632,16 +823,35 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v3i32_align16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align16:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align16:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v3i32_align16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align16:
; GFX9: ; %bb.0:
@@ -720,6 +930,53 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
;
+; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1250-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s4, s[0:1], 0x2
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s5, s[0:1], 0x5
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s6, s[0:1], 0x7
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s7, s[0:1], 0x6
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s8, s[0:1], 0x9
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s9, s[0:1], 0xb
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s10, s[0:1], 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s11, s[0:1], 0x4
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s12, s[0:1], 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s1, s[0:1], 0x8
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 8
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s3, 24
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s4, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s5, 8
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s3
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s6, 24
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s7, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s8, 8
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s10
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s8, s9, 24
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s2, s0
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s12, 16
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s11
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s5, s6
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s5, s7, s1
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s8, s2
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s4, s3
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5
+; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
@@ -916,6 +1173,34 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s7
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
;
+; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1250-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0x5
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s2, s[0:1], 0x2
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s3, s[0:1], 0x6
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s4, s[0:1], 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s5, s[0:1], 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s6, s[0:1], 0x4
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s7, s[0:1], 0x8
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s3, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s4, 16
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s5
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s1, s6
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s7
+; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 6baa10bb4862..8533e34ff13f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -807,10 +807,10 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
; GFX8-LABEL: v_lshr_v2i16_15:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, 15
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, 15, v0
-; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 31
+; GFX8-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b16_e32 v0, 15, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_lshr_v2i16_15:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 07d5ff2036d9..b75eb737534e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -1379,45 +1379,43 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in
; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
+; GFX6-NEXT: v_lshl_b64 v[1:2], v[0:1], 2
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
-; GFX6-NEXT: v_mov_b32_e32 v2, 2
+; GFX6-NEXT: v_mov_b32_e32 v0, 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, 2
+; GFX7-NEXT: v_mov_b32_e32 v0, 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_mov_b32_e32 v4, 2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
-; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: v_mov_b32_e32 v2, 2
+; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index afabc7b62386..917b50f14bfc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -99,15 +99,13 @@ define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
; GCN-LABEL: v_orn2_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_not_b32_e32 v1, v1
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_bfi_b32 v0, v1, v0, -1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_orn2_i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v1, v0, -1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor i32 %src1, -1
%or = or i32 %src0, %not.src1
@@ -117,14 +115,12 @@ define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) {
; GCN-LABEL: v_orn2_i32_sv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: v_or_b32_e32 v0, s2, v0
+; GCN-NEXT: v_bfi_b32 v0, v0, s2, -1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_orn2_i32_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, s2, -1
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i32 %src1, -1
%or = or i32 %src0, %not.src1
@@ -135,14 +131,12 @@ define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) {
define amdgpu_ps float @v_orn2_i32_vs(i32 %src0, i32 inreg %src1) {
; GCN-LABEL: v_orn2_i32_vs:
; GCN: ; %bb.0:
-; GCN-NEXT: s_not_b32 s0, s2
-; GCN-NEXT: v_or_b32_e32 v0, s0, v0
+; GCN-NEXT: v_bfi_b32 v0, s2, v0, -1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_orn2_i32_vs:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_not_b32 s0, s2
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10PLUS-NEXT: v_bfi_b32 v0, s2, v0, -1
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i32 %src1, -1
%or = or i32 %src0, %not.src1
@@ -247,19 +241,15 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
; GCN-LABEL: v_orn2_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_not_b32_e32 v2, v2
-; GCN-NEXT: v_not_b32_e32 v3, v3
-; GCN-NEXT: v_or_b32_e32 v0, v0, v2
-; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: v_bfi_b32 v0, v2, v0, -1
+; GCN-NEXT: v_bfi_b32 v1, v3, v1, -1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_orn2_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2
-; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, v0, -1
+; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, v1, -1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor i64 %src1, -1
%or = or i64 %src0, %not.src1
@@ -269,18 +259,14 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
define amdgpu_ps <2 x float> @v_orn2_i64_sv(i64 inreg %src0, i64 %src1) {
; GCN-LABEL: v_orn2_i64_sv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: v_not_b32_e32 v1, v1
-; GCN-NEXT: v_or_b32_e32 v0, s2, v0
-; GCN-NEXT: v_or_b32_e32 v1, s3, v1
+; GCN-NEXT: v_bfi_b32 v0, v0, s2, -1
+; GCN-NEXT: v_bfi_b32 v1, v1, s3, -1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_orn2_i64_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
-; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
-; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX10PLUS-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, s2, -1
+; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, s3, -1
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i64 %src1, -1
%or = or i64 %src0, %not.src1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir
new file mode 100644
index 000000000000..f2d3272e8727
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir
@@ -0,0 +1,77 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1200 -run-pass=amdgpu-regbank-combiner %s -o - | FileCheck %s
+
+# COM: Check that amdgpu-regbank-combiner doesn't crash when a G_AMDGPU_FMED3 operand is defined by INLINEASM or G_UNMERGE_VALUES.
+
+---
+name: test_inline_asm
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+ mode:
+ ieee: true
+ dx10-clamp: true
+body: |
+ bb.1 :
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: test_inline_asm
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %5(s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[FMUL]], %5, [[COPY2]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+ %2:vgpr(s32) = COPY %1(s32)
+ %3:vgpr(s32) = G_FMUL %0, %2
+ %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 2031626 /* regdef:VGPR_32 */, def %5:vgpr_32
+ %6:vgpr(s32) = COPY %4(s32)
+ %7:vgpr(s32) = nnan G_AMDGPU_FMED3 %3(s32), %5(s32), %6(s32)
+ $vgpr0 = COPY %7(s32)
+...
+
+---
+name: test_unmerge_values
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+ mode:
+ ieee: true
+ dx10-clamp: true
+body: |
+ bb.1 :
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: test_unmerge_values
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[FMUL]], [[C2]], [[COPY2]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[C2]](s32)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+ %2:vgpr(s32) = COPY %1(s32)
+ %3:vgpr(s32) = G_FMUL %0, %2
+ %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ %5:vgpr(s64) = G_CONSTANT i64 123456789
+ %6:vgpr(s32), %7:vgpr(s32) = G_UNMERGE_VALUES %5(s64)
+ %8:vgpr(s32) = COPY %4(s32)
+ %9:vgpr(s32) = nnan G_AMDGPU_FMED3 %3(s32), %7(s32), %8(s32)
+ $vgpr0 = COPY %7(s32)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
index 89681e7329e7..c82f7c53696d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -2,6 +2,7 @@
; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s -check-prefix=GFX7
; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s -check-prefix=GFX7
; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -simplify-mir -stop-after=regbankselect -o - %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -simplify-mir -stop-after=regbankselect -o - %s | FileCheck %s -check-prefix=GFX12
; Natural mapping
define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
index 1b64099d6bf5..e448c4cba094 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX7
# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX12
+# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s -check-prefixes=GCN,GFX12
--- |
define amdgpu_kernel void @load_global_v8i32_non_uniform(ptr addrspace(1) %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
index 997ac804f710..b2ff0995ce57 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX7 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX12 %s
--- |
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 832f066adaa8..2f956d7a0a53 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -229,21 +229,23 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
+; GFX6-NEXT: v_min_i32_e32 v6, 0, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v7, 1
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; GFX6-NEXT: v_max_i32_e32 v4, 0, v0
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v7, v6
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
-; GFX6-NEXT: v_max_i32_e32 v1, v5, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v6, v1
; GFX6-NEXT: v_min_i32_e32 v1, v1, v4
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
; GFX6-NEXT: v_min_i32_e32 v4, 0, v1
+; GFX6-NEXT: v_bfrev_b32_e32 v5, -2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
; GFX6-NEXT: v_max_i32_e32 v3, 0, v1
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x80000000, v4
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v5, v3
; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v3
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
@@ -2951,20 +2953,22 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
; GFX6-LABEL: saddsat_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_min_i32_e32 v3, 0, v0
+; GFX6-NEXT: v_min_i32_e32 v4, 0, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v5, 1
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
; GFX6-NEXT: v_max_i32_e32 v2, 0, v0
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v5, v4
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
-; GFX6-NEXT: v_max_i32_e32 v3, s0, v3
+; GFX6-NEXT: v_max_i32_e32 v4, s0, v4
+; GFX6-NEXT: v_min_i32_e32 v2, v4, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_min_i32_e32 v2, v3, v2
-; GFX6-NEXT: v_min_i32_e32 v3, 0, v1
+; GFX6-NEXT: v_bfrev_b32_e32 v3, -2
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: s_lshl_b32 s0, s1, 16
; GFX6-NEXT: v_max_i32_e32 v2, 0, v1
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT: v_min_i32_e32 v3, 0, v1
+; GFX6-NEXT: s_lshl_b32 s0, s1, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
; GFX6-NEXT: v_max_i32_e32 v3, s0, v3
; GFX6-NEXT: v_min_i32_e32 v2, v3, v2
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 8d8eca162257..19dc20c51004 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -1067,24 +1067,24 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
-; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v9, v1, s[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v9, v1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
+; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
+; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5
-; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
-; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 0x1000, v4
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc
+; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5]
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
@@ -1660,24 +1660,24 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
-; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v9, v1, s[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v9, v1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
+; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
+; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5
-; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
-; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 0x12d8fb, v4
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc
+; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5]
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 2673ac4fb5ba..c1b225562b77 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -233,16 +233,17 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4
-; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT: v_bfrev_b32_e32 v6, 1
-; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GFX6-NEXT: v_min_i32_e32 v6, -1, v0
+; GFX6-NEXT: v_bfrev_b32_e32 v7, 1
+; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
-; GFX6-NEXT: v_min_i32_e32 v1, v1, v5
+; GFX6-NEXT: v_min_i32_e32 v1, v1, v6
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x80000001
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
; GFX6-NEXT: v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GFX6-NEXT: v_min_i32_e32 v4, -1, v1
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4
; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
@@ -1260,7 +1261,8 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX6-NEXT: v_max_i32_e32 v4, -1, v0
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4
; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
; GFX6-NEXT: v_min_i32_e32 v2, v2, v5
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
@@ -1279,7 +1281,8 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX8-NEXT: v_max_i32_e32 v4, -1, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000001, v4
; GFX8-NEXT: v_min_i32_e32 v5, -1, v0
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000000, v5
+; GFX8-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6
; GFX8-NEXT: v_max_i32_e32 v2, v4, v2
; GFX8-NEXT: v_min_i32_e32 v2, v2, v5
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 38ef707fa65a..3685eed5043a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -71,14 +71,14 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s5, 0xffff, s0
-; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: s_lshr_b32 s0, s5, 8
+; GFX9-NEXT: s_lshr_b32 s5, s5, 8
+; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: ds_write_b8 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
-; GFX9-NEXT: s_lshr_b32 s0, s4, 8
+; GFX9-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -90,7 +90,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
-; GFX9-NEXT: s_lshr_b32 s1, s0, 8
+; GFX9-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
; GFX9-NEXT: v_mov_b32_e32 v0, s1
@@ -102,7 +102,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
-; GFX9-NEXT: s_lshr_b32 s1, s0, 8
+; GFX9-NEXT: s_lshr_b32 s1, s2, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
; GFX9-NEXT: v_mov_b32_e32 v0, s1
@@ -114,7 +114,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX9-NEXT: ds_write_b8 v1, v0 offset:12
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_write_b8 v1, v0 offset:13
-; GFX9-NEXT: s_lshr_b32 s1, s0, 8
+; GFX9-NEXT: s_lshr_b32 s1, s3, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_write_b8 v1, v0 offset:14
; GFX9-NEXT: v_mov_b32_e32 v0, s1
@@ -181,37 +181,37 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-NEXT: s_lshr_b32 s0, s1, 16
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s1
+; GFX10-NEXT: s_and_b32 s5, 0xffff, s0
+; GFX10-NEXT: s_lshr_b32 s0, s0, 24
+; GFX10-NEXT: s_and_b32 s7, 0xffff, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-NEXT: s_lshr_b32 s1, s2, 16
-; GFX10-NEXT: s_and_b32 s7, 0xffff, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s2
-; GFX10-NEXT: s_lshr_b32 s2, s5, 8
+; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: s_lshr_b32 s5, s4, 8
-; GFX10-NEXT: s_lshr_b32 s4, s6, 8
-; GFX10-NEXT: s_lshr_b32 s6, s0, 8
+; GFX10-NEXT: s_lshr_b32 s1, s1, 24
+; GFX10-NEXT: s_lshr_b32 s8, s2, 16
+; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
+; GFX10-NEXT: s_lshr_b32 s5, s5, 8
; GFX10-NEXT: v_mov_b32_e32 v5, s0
-; GFX10-NEXT: v_mov_b32_e32 v6, s2
; GFX10-NEXT: s_lshr_b32 s0, s7, 8
-; GFX10-NEXT: v_mov_b32_e32 v7, s5
-; GFX10-NEXT: v_mov_b32_e32 v8, s4
-; GFX10-NEXT: v_mov_b32_e32 v9, s6
+; GFX10-NEXT: v_mov_b32_e32 v6, s6
+; GFX10-NEXT: v_mov_b32_e32 v7, s1
+; GFX10-NEXT: s_lshr_b32 s1, s9, 8
+; GFX10-NEXT: v_mov_b32_e32 v8, s5
+; GFX10-NEXT: v_mov_b32_e32 v9, s0
; GFX10-NEXT: ds_write_b8 v1, v0
; GFX10-NEXT: ds_write_b8 v1, v2 offset:4
; GFX10-NEXT: ds_write_b8 v1, v4 offset:2
-; GFX10-NEXT: ds_write_b8 v1, v5 offset:6
-; GFX10-NEXT: ds_write_b8 v1, v6 offset:1
-; GFX10-NEXT: ds_write_b8 v1, v7 offset:3
-; GFX10-NEXT: ds_write_b8 v1, v8 offset:5
-; GFX10-NEXT: v_mov_b32_e32 v0, s1
-; GFX10-NEXT: v_mov_b32_e32 v10, s0
-; GFX10-NEXT: s_lshr_b32 s0, s1, 8
-; GFX10-NEXT: ds_write_b8 v1, v9 offset:7
+; GFX10-NEXT: ds_write_b8 v1, v5 offset:3
+; GFX10-NEXT: ds_write_b8 v1, v6 offset:6
+; GFX10-NEXT: ds_write_b8 v1, v8 offset:1
+; GFX10-NEXT: ds_write_b8 v1, v9 offset:5
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
+; GFX10-NEXT: v_mov_b32_e32 v10, s1
+; GFX10-NEXT: s_lshr_b32 s0, s2, 24
+; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
@@ -221,7 +221,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX10-NEXT: s_lshr_b32 s0, s0, 8
; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: v_mov_b32_e32 v3, s0
-; GFX10-NEXT: s_lshr_b32 s0, s1, 8
+; GFX10-NEXT: s_lshr_b32 s0, s3, 24
; GFX10-NEXT: v_mov_b32_e32 v4, s1
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: ds_write_b8 v1, v0 offset:11
@@ -240,38 +240,37 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
; GFX11-NEXT: s_and_b32 s6, 0xffff, s0
; GFX11-NEXT: s_lshr_b32 s5, s0, 16
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4
-; GFX11-NEXT: s_lshr_b32 s0, s1, 16
-; GFX11-NEXT: s_and_b32 s4, 0xffff, s1
+; GFX11-NEXT: s_lshr_b32 s0, s0, 24
+; GFX11-NEXT: s_lshr_b32 s4, s1, 16
+; GFX11-NEXT: s_and_b32 s7, 0xffff, s1
+; GFX11-NEXT: s_lshr_b32 s6, s6, 8
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
-; GFX11-NEXT: s_lshr_b32 s1, s2, 16
-; GFX11-NEXT: s_and_b32 s7, 0xffff, s2
-; GFX11-NEXT: s_lshr_b32 s2, s6, 8
-; GFX11-NEXT: s_lshr_b32 s6, s5, 8
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6
+; GFX11-NEXT: s_lshr_b32 s1, s1, 24
; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: s_lshr_b32 s4, s4, 8
-; GFX11-NEXT: s_lshr_b32 s5, s0, 8
; GFX11-NEXT: s_lshr_b32 s0, s7, 8
-; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
+; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s6
+; GFX11-NEXT: s_and_b32 s9, 0xffff, s2
+; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: s_lshr_b32 s0, s2, 24
+; GFX11-NEXT: s_lshr_b32 s1, s9, 8
; GFX11-NEXT: ds_store_b8 v1, v0
-; GFX11-NEXT: ds_store_b8 v1, v6 offset:1
+; GFX11-NEXT: ds_store_b8 v1, v7 offset:1
; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
-; GFX11-NEXT: ds_store_b8 v1, v7 offset:3
+; GFX11-NEXT: ds_store_b8 v1, v5 offset:3
; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
-; GFX11-NEXT: ds_store_b8 v1, v8 offset:5
-; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
-; GFX11-NEXT: ds_store_b8 v1, v9 offset:7
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s3
-; GFX11-NEXT: s_lshr_b32 s0, s1, 8
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: ds_store_b8 v1, v9 offset:5
+; GFX11-NEXT: ds_store_b8 v1, v6 offset:6
+; GFX11-NEXT: ds_store_b8 v1, v8 offset:7
; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: s_and_b32 s0, 0xffff, s3
-; GFX11-NEXT: s_lshr_b32 s1, s3, 16
+; GFX11-NEXT: s_lshr_b32 s8, s2, 16
+; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v5, s3
; GFX11-NEXT: s_lshr_b32 s0, s0, 8
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX11-NEXT: s_lshr_b32 s0, s1, 8
+; GFX11-NEXT: s_lshr_b32 s1, s3, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v7, s1
+; GFX11-NEXT: v_mov_b32_e32 v6, s0
+; GFX11-NEXT: s_lshr_b32 s0, s3, 24
; GFX11-NEXT: v_mov_b32_e32 v8, s0
; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
; GFX11-NEXT: ds_store_b8 v1, v0 offset:9
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
index 1d2d330eeb61..cce6bd9301cb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -72,15 +72,15 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX9-NEXT: s_and_b32 s5, 0xffff, s0
-; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_lshr_b32 s0, s5, 8
+; GFX9-NEXT: s_lshr_b32 s3, s5, 8
+; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: ds_write_b8 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
-; GFX9-NEXT: s_lshr_b32 s0, s4, 8
+; GFX9-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -92,7 +92,7 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
-; GFX9-NEXT: s_lshr_b32 s1, s0, 8
+; GFX9-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
; GFX9-NEXT: v_mov_b32_e32 v0, s1
@@ -104,7 +104,7 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
-; GFX9-NEXT: s_lshr_b32 s1, s0, 8
+; GFX9-NEXT: s_lshr_b32 s1, s2, 24
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
; GFX9-NEXT: v_mov_b32_e32 v0, s1
@@ -163,37 +163,37 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_and_b32 s5, 0xffff, s0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: s_lshr_b32 s0, s1, 16
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: s_and_b32 s3, 0xffff, s1
+; GFX10-NEXT: s_lshr_b32 s0, s0, 24
+; GFX10-NEXT: s_lshr_b32 s3, s1, 16
+; GFX10-NEXT: s_and_b32 s6, 0xffff, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-NEXT: s_lshr_b32 s1, s2, 16
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s2
-; GFX10-NEXT: s_lshr_b32 s2, s5, 8
-; GFX10-NEXT: s_lshr_b32 s5, s4, 8
+; GFX10-NEXT: s_lshr_b32 s1, s1, 24
+; GFX10-NEXT: s_and_b32 s8, 0xffff, s2
; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: s_lshr_b32 s4, s0, 8
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: s_lshr_b32 s0, s6, 8
-; GFX10-NEXT: v_mov_b32_e32 v9, s4
-; GFX10-NEXT: s_lshr_b32 s3, s3, 8
-; GFX10-NEXT: v_mov_b32_e32 v6, s2
-; GFX10-NEXT: v_mov_b32_e32 v10, s0
-; GFX10-NEXT: s_lshr_b32 s0, s1, 8
-; GFX10-NEXT: v_mov_b32_e32 v7, s5
-; GFX10-NEXT: v_mov_b32_e32 v8, s3
+; GFX10-NEXT: v_mov_b32_e32 v7, s1
+; GFX10-NEXT: s_lshr_b32 s1, s8, 8
+; GFX10-NEXT: s_lshr_b32 s7, s2, 16
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
+; GFX10-NEXT: s_lshr_b32 s5, s5, 8
+; GFX10-NEXT: v_mov_b32_e32 v6, s3
+; GFX10-NEXT: v_mov_b32_e32 v9, s0
+; GFX10-NEXT: v_mov_b32_e32 v10, s1
+; GFX10-NEXT: s_lshr_b32 s0, s2, 24
+; GFX10-NEXT: v_mov_b32_e32 v8, s5
; GFX10-NEXT: ds_write_b8 v1, v0
; GFX10-NEXT: ds_write_b8 v1, v2 offset:4
; GFX10-NEXT: ds_write_b8 v1, v4 offset:2
-; GFX10-NEXT: ds_write_b8 v1, v5 offset:6
-; GFX10-NEXT: ds_write_b8 v1, v6 offset:1
-; GFX10-NEXT: ds_write_b8 v1, v7 offset:3
-; GFX10-NEXT: ds_write_b8 v1, v8 offset:5
-; GFX10-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-NEXT: ds_write_b8 v1, v5 offset:3
+; GFX10-NEXT: ds_write_b8 v1, v6 offset:6
+; GFX10-NEXT: ds_write_b8 v1, v8 offset:1
+; GFX10-NEXT: ds_write_b8 v1, v9 offset:5
+; GFX10-NEXT: v_mov_b32_e32 v0, s7
; GFX10-NEXT: v_mov_b32_e32 v2, s0
-; GFX10-NEXT: ds_write_b8 v1, v9 offset:7
+; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
@@ -206,37 +206,37 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_and_b32 s5, 0xffff, s0
+; GFX11-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-NEXT: s_lshr_b32 s5, s5, 8
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: s_lshr_b32 s0, s1, 16
-; GFX11-NEXT: s_and_b32 s3, 0xffff, s1
+; GFX11-NEXT: s_lshr_b32 s0, s0, 24
+; GFX11-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11-NEXT: s_and_b32 s6, 0xffff, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
-; GFX11-NEXT: s_lshr_b32 s1, s2, 16
-; GFX11-NEXT: s_and_b32 s6, 0xffff, s2
-; GFX11-NEXT: s_lshr_b32 s2, s5, 8
-; GFX11-NEXT: s_lshr_b32 s5, s4, 8
-; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s2
+; GFX11-NEXT: s_lshr_b32 s1, s1, 24
+; GFX11-NEXT: s_and_b32 s8, 0xffff, s2
+; GFX11-NEXT: v_dual_mov_b32 v8, s7 :: v_dual_mov_b32 v9, s5
; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: s_lshr_b32 s3, s3, 8
-; GFX11-NEXT: s_lshr_b32 s4, s0, 8
+; GFX11-NEXT: s_lshr_b32 s2, s2, 24
; GFX11-NEXT: s_lshr_b32 s0, s6, 8
-; GFX11-NEXT: s_lshr_b32 s6, s1, 8
-; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s3
-; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT: v_mov_b32_e32 v12, s6
+; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s1
+; GFX11-NEXT: s_lshr_b32 s1, s8, 8
+; GFX11-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_mov_b32_e32 v12, s1
; GFX11-NEXT: ds_store_b8 v1, v0
-; GFX11-NEXT: ds_store_b8 v1, v7 offset:1
+; GFX11-NEXT: ds_store_b8 v1, v9 offset:1
; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
-; GFX11-NEXT: ds_store_b8 v1, v8 offset:3
+; GFX11-NEXT: ds_store_b8 v1, v5 offset:3
; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
-; GFX11-NEXT: ds_store_b8 v1, v9 offset:5
-; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
-; GFX11-NEXT: ds_store_b8 v1, v10 offset:7
+; GFX11-NEXT: ds_store_b8 v1, v11 offset:5
+; GFX11-NEXT: ds_store_b8 v1, v6 offset:6
+; GFX11-NEXT: ds_store_b8 v1, v7 offset:7
; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
-; GFX11-NEXT: ds_store_b8 v1, v11 offset:9
-; GFX11-NEXT: ds_store_b8 v1, v6 offset:10
-; GFX11-NEXT: ds_store_b8 v1, v12 offset:11
+; GFX11-NEXT: ds_store_b8 v1, v12 offset:9
+; GFX11-NEXT: ds_store_b8 v1, v8 offset:10
+; GFX11-NEXT: ds_store_b8 v1, v10 offset:11
; GFX11-NEXT: s_endpgm
store <3 x i32> %x, ptr addrspace(3) %out, align 1
ret void